From c8e465d059be925d1dc274c4bce5d68f3622b5f1 Mon Sep 17 00:00:00 2001 From: maydomine <1583143678@qq.com> Date: Mon, 29 May 2023 18:06:47 +0800 Subject: [PATCH 01/13] modify the version check to make it work with torch2.0 --- bmtrain/optim/adam.py | 4 ++-- bmtrain/optim/adam_offload.py | 4 ++-- bmtrain/utils.py | 12 ++++++++++++ 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/bmtrain/optim/adam.py b/bmtrain/optim/adam.py index 689b151e..dda46f18 100644 --- a/bmtrain/optim/adam.py +++ b/bmtrain/optim/adam.py @@ -4,7 +4,7 @@ from . import _cuda as C from .. import nccl import inspect - +from ..utils import check_torch_version from copy import deepcopy from itertools import chain from collections import defaultdict @@ -110,7 +110,7 @@ def step(self, closure=None, scale=1): [state['exp_avg']], [state["exp_avg_sq"]], [], - [state["step"]] if int(torch.__version__.split('.')[1]) < 12 + [state["step"]] if check_torch_version("1.12.0") < 0 else [torch.tensor(state["step"])], amsgrad=False, beta1=group['betas'][0], diff --git a/bmtrain/optim/adam_offload.py b/bmtrain/optim/adam_offload.py index 1b337268..7e80b866 100644 --- a/bmtrain/optim/adam_offload.py +++ b/bmtrain/optim/adam_offload.py @@ -5,7 +5,7 @@ from .. import nccl import torch.optim._functional as F import inspect - +from ..utils import check_torch_version from copy import deepcopy from itertools import chain from collections import defaultdict @@ -136,7 +136,7 @@ def step(self, closure=None, scale=1): [state["exp_avg"]], [state["exp_avg_sq"]], [], - [state["step"]] if int(torch.__version__.split('.')[1]) < 12 + [state["step"]] if check_torch_version("1.12.0") < 0 else [torch.tensor(state["step"])], amsgrad=False, beta1=beta1, diff --git a/bmtrain/utils.py b/bmtrain/utils.py index 2df9a3a5..391b5a8f 100644 --- a/bmtrain/utils.py +++ b/bmtrain/utils.py @@ -5,6 +5,18 @@ ALIGN = 4 ROW_WIDTH = 60 +def check_torch_version(version_str): + """ + Checks if the current torch version is greater than or equal to the given version. + version_str (str): The version to compare with, in the format of "x.y.z" ,and the func will convert it into a int value of x*100+y*10+z. 
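+    For example, "1.12.0" is packed as 1*10000 + 12*100 + 0 = 11200, matching the computation below.
+    Returns the difference (current - given): a negative value means the installed torch is older than version_str, e.g. check_torch_version("1.12.0") < 0 on torch 1.11.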
+ """ + version_int_arr = [int(v) for v in version_str.split(".")] + + version_int = version_int_arr[0] * 10000 + version_int_arr[1] * 100 + version_int_arr[2] + torch_version = torch.__version__.split("+")[0] + current_version_int_arr = [int(v) for v in torch_version.split(".")] + current_version_int = current_version_int_arr[0] * 10000 + current_version_int_arr[1] * 100 + current_version_int_arr[2] + return current_version_int - version_int def round_up(x, d): return (x + d - 1) // d * d From 41612bc9420dc157f7e2215abcd3dc8aca137bba Mon Sep 17 00:00:00 2001 From: maydomine <1583143678@qq.com> Date: Wed, 31 May 2023 20:13:27 +0800 Subject: [PATCH 02/13] torch 2.0 named_members interface && fix pipeline communication bug --- bmtrain/block_layer.py | 2 +- bmtrain/distributed/ops.py | 13 ++++++++++--- bmtrain/init.py | 2 ++ 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/bmtrain/block_layer.py b/bmtrain/block_layer.py index 5269673b..2210c297 100644 --- a/bmtrain/block_layer.py +++ b/bmtrain/block_layer.py @@ -617,7 +617,7 @@ def init_parameters(self): torch.tensor([], dtype=d_dtype, device=d_device).set_(tmp_tensor.storage(), offset_st, (offset_end - offset_st,))[:] del tmp_tensor - def _named_members(self, get_members_fn, prefix='', recurse=True): + def _named_members(self, get_members_fn, prefix='', recurse=True, remove_duplicate=True): r"""Helper method for yielding various names + members of modules.""" return self._module._named_members(get_members_fn, prefix, recurse) diff --git a/bmtrain/distributed/ops.py b/bmtrain/distributed/ops.py index 6124622d..ef69659a 100644 --- a/bmtrain/distributed/ops.py +++ b/bmtrain/distributed/ops.py @@ -20,22 +20,29 @@ def send_activations(hidden_state, next_rank, comm): send_meta(hidden_state, next_rank, comm) ncclSend(hidden_state.storage(), next_rank, comm) + def recv_activations(prev_rank, comm): dtype, shape = recv_meta(prev_rank, comm) hidden_state = torch.empty(shape, dtype=dtype, device="cuda") ncclRecv(hidden_state.storage(), prev_rank, comm) return hidden_state + def send_meta(x, next_rank, comm): - meta = [len(x.size()), DTYPE_LIST.index(x.dtype)] + list(x.size()) - meta_data = torch.tensor(data=meta, device=x.device, dtype=torch.long) + meta_data = torch.tensor(data=[0]*50, device="cuda", dtype=torch.int) + meta_data[0] = len(x.size()) + meta_data[1] = DTYPE_LIST.index(x.dtype) + meta_data[2:len(x.size())+2] = torch.tensor(x.size(), device="cuda", dtype=torch.int) + meta_data = meta_data.contiguous() ncclSend(meta_data.storage(), next_rank, comm) + def recv_meta(prev_rank, comm): - meta_data = torch.tensor(data=[0]*50, device="cuda", dtype=torch.long) + meta_data = torch.tensor(data=[0]*50, device="cuda", dtype=torch.int) ncclRecv(meta_data.storage(), prev_rank, comm) n_dims = meta_data[0].item() dtype = DTYPE_LIST[meta_data[1].item()] shape = meta_data[2:n_dims+2].tolist() return dtype,shape + class OpBroadcast(torch.autograd.Function): @staticmethod def forward(ctx, src, root, comm = None): diff --git a/bmtrain/init.py b/bmtrain/init.py index b98064b7..5c3006d2 100644 --- a/bmtrain/init.py +++ b/bmtrain/init.py @@ -149,11 +149,13 @@ def __init__(self,config): self.prev_rank = self.stage_id-1 if self.stage_id > 0 else -1 self.tails = self.pp_group[self.pipe_idx, self.stage_id:].tolist() self.heads = self.pp_group[self.pipe_idx, :self.stage_id + 1].tolist() + def get_group_id(self,group_name): if group_name == "pipe": return self.pipe_idx elif group_name == "zero": return self.zero_idx + def get_group_rank(self,group_name): 
if group_name == "pipe": return self.stage_id From bb7381e8c757e2404dae01dd3a4235a640fad4b4 Mon Sep 17 00:00:00 2001 From: maydomine <1583143678@qq.com> Date: Thu, 1 Jun 2023 14:21:41 +0800 Subject: [PATCH 03/13] nccl header update && nccl cpp extension has no denpendcy of torch now --- csrc/include/nccl.h | 55 ++++++++-- csrc/include/nccl_net.h | 217 +++++++++++++++++++++++++++++++++++++--- csrc/nccl.cpp | 6 +- setup.py | 146 ++++++++++++++++++++++++++- 4 files changed, 397 insertions(+), 27 deletions(-) diff --git a/csrc/include/nccl.h b/csrc/include/nccl.h index 4bdda76a..90f721c7 100644 --- a/csrc/include/nccl.h +++ b/csrc/include/nccl.h @@ -14,11 +14,11 @@ #endif #define NCCL_MAJOR 2 -#define NCCL_MINOR 11 -#define NCCL_PATCH 4 +#define NCCL_MINOR 14 +#define NCCL_PATCH 3 #define NCCL_SUFFIX "" -#define NCCL_VERSION_CODE 21104 +#define NCCL_VERSION_CODE 21403 #define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z)) #ifdef __cplusplus @@ -38,7 +38,29 @@ typedef enum { ncclSuccess = 0, ncclInternalError = 3, ncclInvalidArgument = 4, ncclInvalidUsage = 5, - ncclNumResults = 6 } ncclResult_t; + ncclRemoteError = 6, + ncclInProgress = 7, + ncclNumResults = 8 } ncclResult_t; + +/* Communicator configuration. Users can assign value to attributes to specify the + * behavior of a communicator. */ +typedef struct ncclConfig_v21400 { + /* attributes that users should never touch. */ + size_t size; + unsigned int magic; + unsigned int version; + /* attributes that users are able to customize. */ + int blocking; +} ncclConfig_t; + +/* Config initializer must be assigned to initialize config structure when it is created. + * Not initialized config will result in NCCL error. */ +#define NCCL_CONFIG_INITIALIZER { \ + sizeof(ncclConfig_t), /* size */ \ + 0xcafebeef, /* magic */ \ + NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \ + 1 /* blocking */ \ +} /* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer. * This integer is coded with the MAJOR, MINOR and PATCH level of the @@ -53,6 +75,11 @@ ncclResult_t pncclGetVersion(int *version); ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId); ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId); +/* Create a new communicator (multi thread/process version) with a configuration + * set by users. */ +ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config); +ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config); + /* Creates a new communicator (multi thread/process version). * rank must be between 0 and nranks-1 and unique within a communicator clique. * Each rank is associated to a CUDA device, which has to be set before calling @@ -71,8 +98,15 @@ ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); -/* Frees resources associated with communicator object, but waits for any operations - * that might still be running on the device. */ +/* Finalize a communicator. ncclCommFinalize flushes all issued communications, + * and marks communicator state as ncclInProgress. 
The state will change to ncclSuccess + * when the communicator is globally quiescent and related resources are freed; then, + * calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator + * itself) without blocking. */ +ncclResult_t ncclCommFinalize(ncclComm_t comm); +ncclResult_t pncclCommFinalize(ncclComm_t comm); + +/* Frees local resources associated with communicator object. */ ncclResult_t ncclCommDestroy(ncclComm_t comm); ncclResult_t pncclCommDestroy(ncclComm_t comm); @@ -81,10 +115,16 @@ ncclResult_t pncclCommDestroy(ncclComm_t comm); ncclResult_t ncclCommAbort(ncclComm_t comm); ncclResult_t pncclCommAbort(ncclComm_t comm); -/* Returns a human-readable error message. */ +/* Returns a string for each error code. */ const char* ncclGetErrorString(ncclResult_t result); const char* pncclGetErrorString(ncclResult_t result); +/* Returns a human-readable message of the last error that occurred. + * comm is currently unused and can be set to NULL + */ +const char* ncclGetLastError(ncclComm_t comm); +const char* pncclGetError(ncclComm_t comm); + /* Checks whether the comm has encountered any asynchronous errors */ ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); @@ -351,7 +391,6 @@ ncclResult_t pncclGroupStart(); ncclResult_t ncclGroupEnd(); ncclResult_t pncclGroupEnd(); - #ifdef __cplusplus } // end extern "C" #endif diff --git a/csrc/include/nccl_net.h b/csrc/include/nccl_net.h index 389c1eaa..255a44ee 100644 --- a/csrc/include/nccl_net.h +++ b/csrc/include/nccl_net.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -10,16 +10,17 @@ #include "nccl.h" #include -#define NCCL_NET_HANDLE_MAXSIZE 64 +#define NCCL_NET_HANDLE_MAXSIZE 128 #define NCCL_PTR_HOST 0x1 #define NCCL_PTR_CUDA 0x2 +#define NCCL_PTR_DMABUF 0x4 // Maximum number of requests per comm object #define NCCL_NET_MAX_REQUESTS 8 typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; -typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_ALL=~0} ncclDebugLogSubSys; +typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_ALL=~0} ncclDebugLogSubSys; typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); @@ -28,14 +29,209 @@ typedef struct { char* pciPath; // Path to the PCI device in /sys. uint64_t guid; // Unique identifier for the NIC chip. Important for // cards with multiple PCI functions (Physical or virtual). - int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] int speed; // Port speed in Mbps. int port; // Port number. + float latency; // Network latency int maxComms; // Maximum number of comms we can create -}ncclNetProperties_v4_t; + int maxRecvs; // Maximum number of grouped receives. 
+}ncclNetProperties_v6_t; + +typedef ncclNetProperties_v6_t ncclNetProperties_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + ncclResult_t (*connect)(int dev, void* handle, void** sendComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + ncclResult_t (*accept)(void* listenComm, void** recvComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclNet_v6_t; + +typedef ncclNet_v6_t ncclNet_t; + +#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v6 + +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); + // Create a receiving object and provide a handle to connect to it. 
The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclCollNet_v6_t; -typedef ncclNetProperties_v4_t ncclNetProperties_t; +typedef ncclCollNet_v6_t ncclCollNet_t; +#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v6 + +// v5 struct for backwards compatibility +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + ncclResult_t (*connect)(int dev, void* handle, void** sendComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + ncclResult_t (*accept)(void* listenComm, void** recvComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 
+ ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclNet_v5_t; + +// v5 struct for backwards compatibility +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. 
+ ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclCollNet_v5_t; + +// v4 struct for backwards compatibility +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA + int speed; // Port speed in Mbps. + int port; // Port number. + int maxComms; // Maximum number of comms we can create +} ncclNetProperties_v4_t; + +// v4 struct for backwards compatibility typedef struct { // Name of the network (mainly for logs) const char* name; @@ -75,10 +271,7 @@ typedef struct { ncclResult_t (*closeListen)(void* listenComm); } ncclNet_v4_t; -typedef ncclNet_v4_t ncclNet_t; - -#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v4 - +// v4 struct for backwards compatibility typedef struct { // Name of the collective network (mainly for logs) const char* name; @@ -117,8 +310,4 @@ typedef struct { ncclResult_t (*closeListen)(void* listenComm); } ncclCollNet_v4_t; -typedef ncclCollNet_v4_t ncclCollNet_t; - -#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v4 - #endif // end include guard diff --git a/csrc/nccl.cpp b/csrc/nccl.cpp index 2d0ac8de..c1dbb321 100644 --- a/csrc/nccl.cpp +++ b/csrc/nccl.cpp @@ -1,7 +1,9 @@ +#include +#include +#include -#include +namespace py = pybind11; #include "include/nccl.h" -#include void checkNCCLStatus(ncclResult_t result) { if (result == ncclSuccess) return; diff --git a/setup.py b/setup.py index 7c29a10f..b7411f4c 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,147 @@ from setuptools import setup, find_packages import torch -from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CppExtension +from setuptools import Extension +from setuptools.command.build_ext import build_ext import os +import os +import setuptools +import subprocess +from pkg_resources import packaging +from typing import Dict, List, Optional, Union, Tuple +def CppExtension(name, sources, *args, **kwargs): + include_dirs = kwargs.get('include_dirs', []) + include_dirs += include_paths() + kwargs['include_dirs'] = include_dirs + + library_dirs = kwargs.get('library_dirs', []) + library_dirs += library_paths() + kwargs['library_dirs'] = library_dirs + + libraries = kwargs.get('libraries', []) + kwargs['libraries'] = libraries + + kwargs['language'] = 'c++' + return setuptools.Extension(name, sources, *args, **kwargs) + + +def CUDAExtension(name, sources, *args, **kwargs): + library_dirs = kwargs.get('library_dirs', []) + library_dirs += library_paths(cuda=True) + kwargs['library_dirs'] = library_dirs + + libraries = kwargs.get('libraries', []) + kwargs['libraries'] = libraries + + include_dirs = kwargs.get('include_dirs', []) + + + include_dirs += include_paths(cuda=True) + kwargs['include_dirs'] = include_dirs + + kwargs['language'] = 'c++' + + dlink_libraries = kwargs.get('dlink_libraries', []) + dlink = kwargs.get('dlink', False) or dlink_libraries + if dlink: + extra_compile_args = kwargs.get('extra_compile_args', {}) + + extra_compile_args_dlink = extra_compile_args.get('nvcc_dlink', []) + extra_compile_args_dlink += ['-dlink'] + extra_compile_args_dlink += [f'-L{x}' for x in library_dirs] + extra_compile_args_dlink += [f'-l{x}' for x in dlink_libraries] + + + extra_compile_args['nvcc_dlink'] 
= extra_compile_args_dlink + + kwargs['extra_compile_args'] = extra_compile_args + + return setuptools.Extension(name, sources, *args, **kwargs) + +def _find_cuda_home() -> Optional[str]: + r'''Finds the CUDA install path.''' + # Guess #1 + cuda_home = os.environ.get('CUDA_HOME') or os.environ.get('CUDA_PATH') + if cuda_home is None: + # Guess #2 + try: + which = 'which' + with open(os.devnull, 'w') as devnull: + nvcc = subprocess.check_output([which, 'nvcc'], + stderr=devnull).decode(*SUBPROCESS_DECODE_ARGS).rstrip('\r\n') + cuda_home = os.path.dirname(os.path.dirname(nvcc)) + except Exception: + #Guess #3 + cuda_home = '/usr/local/cuda' + if not os.path.exists(cuda_home): + cuda_home = None + return cuda_home + +CUDA_HOME = _find_cuda_home() +_BMT_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + +def _join_cuda_home(*paths) -> str: + if CUDA_HOME is None: + raise EnvironmentError('CUDA_HOME environment variable is not set. ' + 'Please set it to your CUDA install root.') + return os.path.join(CUDA_HOME, *paths) + +def CppExtension(name, sources, *args, **kwargs): + library_dirs = kwargs.get('library_dirs', []) + library_dirs += library_paths(cuda=True) + kwargs['library_dirs'] = library_dirs + + libraries = kwargs.get('libraries', []) + libraries.append('cudart') + kwargs['libraries'] = libraries + + include_dirs = kwargs.get('include_dirs', []) + + include_dirs += include_paths(cuda=True) + kwargs['include_dirs'] = include_dirs + + kwargs['language'] = 'c++' + + dlink_libraries = kwargs.get('dlink_libraries', []) + dlink = kwargs.get('dlink', False) or dlink_libraries + if dlink: + extra_compile_args = kwargs.get('extra_compile_args', {}) + + extra_compile_args_dlink = extra_compile_args.get('nvcc_dlink', []) + extra_compile_args_dlink += ['-dlink'] + extra_compile_args_dlink += [f'-L{x}' for x in library_dirs] + extra_compile_args_dlink += [f'-l{x}' for x in dlink_libraries] + + extra_compile_args['nvcc_dlink'] = extra_compile_args_dlink + + kwargs['extra_compile_args'] = extra_compile_args + + return setuptools.Extension(name, sources, *args, **kwargs) + + +def include_paths(cuda: bool = False) -> List[str]: + paths = [ + os.path.join(_BMT_PATH, 'csrc', 'include'), + ] + if cuda: + cuda_home_include = _join_cuda_home('include') + # if we have the Debian/Ubuntu packages for cuda, we get /usr as cuda home. 
+ # but gcc doesn't like having /usr/include passed explicitly + if cuda_home_include != '/usr/include': + paths.append(cuda_home_include) + return paths + + +def library_paths(cuda: bool = False) -> List[str]: + paths = [] + + if cuda: + lib_dir = 'lib64' + if (not os.path.exists(_join_cuda_home(lib_dir)) and + os.path.exists(_join_cuda_home('lib'))): + lib_dir = 'lib' + + paths.append(_join_cuda_home(lib_dir)) + return paths def get_avx_flags(): if os.environ.get("BMT_AVX256", "").lower() in ["1", "true", "on"]: @@ -51,7 +191,7 @@ def get_device_cc(): ext_modules = [ CUDAExtension('bmtrain.nccl._C', [ 'csrc/nccl.cpp', - ], include_dirs=["csrc/nccl/build/include"], extra_compile_args={}), + ], include_dirs=["csrc/nccl/build/include"], extra_compile_args={}, extra_link_args=["../libnccl_static.a"]), CUDAExtension('bmtrain.optim._cuda', [ 'csrc/adam_cuda.cpp', 'csrc/cuda/adam.cu', @@ -83,6 +223,6 @@ def get_device_cc(): ], ext_modules=ext_modules, cmdclass={ - 'build_ext': BuildExtension + 'build_ext': build_ext }) From 0200974299df4f0df0428efc259ff29215567ad5 Mon Sep 17 00:00:00 2001 From: maydomine <1583143678@qq.com> Date: Mon, 12 Jun 2023 14:42:02 +0800 Subject: [PATCH 04/13] use cmake to simpify the installition and better support for torch 2.0 --- CMakeLists.txt | 52 ++++ bmtrain/block_layer.py | 9 +- bmtrain/loss/_function.py | 86 ++++++ bmtrain/loss/cross_entropy.py | 11 +- bmtrain/nccl/__init__.py | 2 +- bmtrain/optim/_function.py | 80 ++++++ bmtrain/optim/adam.py | 11 +- bmtrain/optim/adam_offload.py | 10 +- bmtrain/optim/optim_manager.py | 4 +- cmake/FindNCCL.cmake | 75 +++++ csrc/adam_cuda.cpp | 58 ---- csrc/bind.cpp | 25 ++ csrc/cross_entropy_loss.cpp | 98 ------- csrc/cuda/{adam.cu => adam_cuda.cu} | 31 +- csrc/cuda/cross_entropy.cu | 94 +++--- csrc/cuda/has_inf_nan.cu | 23 +- csrc/cuda/reduce.cuh | 2 - csrc/{adam_cpu.cpp => include/adam_cpu.hpp} | 178 ++++++++---- csrc/include/bind.hpp | 55 ++++ csrc/{nccl.cpp => include/nccl.hpp} | 18 +- setup.py | 302 ++++++-------------- 21 files changed, 687 insertions(+), 537 deletions(-) create mode 100644 CMakeLists.txt create mode 100644 bmtrain/loss/_function.py create mode 100644 bmtrain/optim/_function.py create mode 100644 cmake/FindNCCL.cmake delete mode 100644 csrc/adam_cuda.cpp create mode 100644 csrc/bind.cpp delete mode 100644 csrc/cross_entropy_loss.cpp rename csrc/cuda/{adam.cu => adam_cuda.cu} (62%) rename csrc/{adam_cpu.cpp => include/adam_cpu.hpp} (57%) create mode 100644 csrc/include/bind.hpp rename csrc/{nccl.cpp => include/nccl.hpp} (83%) diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 00000000..a00f3f6d --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,52 @@ +cmake_minimum_required(VERSION 3.18) +project(bmtrain) +enable_language(C) +enable_language(CXX) +set(CMAKE_CUDA_ARCHITECTURES "61;62;70;72;75;80") +enable_language(CUDA) + +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD_REQUIRED True) +set(CMAKE_CUDA_STANDARD 14) +set(CMAKE_CUDA_STANDARD_REQUIRED True) +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86") +set(CMAKE_BUILD_RPATH $ORIGIN) +set(CMAKE_INSTALL_RPATH $ORIGIN) +set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/) + +find_package(NCCL REQUIRED) +find_package(Python ${PYTHON_VERSION} EXACT COMPONENTS Interpreter Development.Module REQUIRED) 
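+# Note: PYTHON_VERSION is assumed to be supplied by the build driver (e.g.
+# `cmake -DPYTHON_VERSION=3.10 ...` invoked from setup.py), so the module is
+# built against the interpreter that will import it; pybind11 only needs to be
+# pip-installed into that same environment, since its CMake dir is queried below.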
+message (STATUS "Python_EXECUTABLE: ${Python_EXECUTABLE}") +execute_process(COMMAND ${Python_EXECUTABLE} "-c" + "import pybind11; print(pybind11.get_cmake_dir())" + OUTPUT_VARIABLE PYBIND11_CMAKE_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE) +message (STATUS "PYBIND11_CMAKE_DIR: ${PYBIND11_CMAKE_DIR}") +list(APPEND CMAKE_PREFIX_PATH ${PYBIND11_CMAKE_DIR}) +find_package(pybind11 REQUIRED) + +message (STATUS "CMAKE_INSTALL_RPATH: ${CMAKE_INSTALL_RPATH}") + +file(GLOB_RECURSE SOURCES "csrc/*.cpp") +file(GLOB_RECURSE CUDA_SOURCES "csrc/cuda/*.cu") + +pybind11_add_module(C ${SOURCES} ${CUDA_SOURCES}) +target_link_libraries(C PRIVATE + "-Wl,-Bsymbolic" + "-Wl,-Bsymbolic-functions" + ${NCCL_LIBRARIES} +) +target_include_directories(C PRIVATE ${NCCL_INCLUDE_DIRS}) +target_compile_definitions(C + PRIVATE VERSION_INFO=${EXAMPLE_VERSION_INFO}) + +set_target_properties(C PROPERTIES CUDA_ARCHITECTURES "61;62;70;72;75;80;86") + +target_include_directories(C + PRIVATE "csrc/include" + PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} +) + +#STATIC ${bmengine_STATIC_LIBRARIES} + + diff --git a/bmtrain/block_layer.py b/bmtrain/block_layer.py index 2210c297..652e085a 100644 --- a/bmtrain/block_layer.py +++ b/bmtrain/block_layer.py @@ -9,6 +9,7 @@ from .checkpointing import ScopedTensorInspectorContext from . import debug import copy +import inspect # the flag is used to control the zero level , 0 means normal zero3 , 1 means forward without release parameter ,2 means backward without gather parameter @@ -617,10 +618,14 @@ def init_parameters(self): torch.tensor([], dtype=d_dtype, device=d_device).set_(tmp_tensor.storage(), offset_st, (offset_end - offset_st,))[:] del tmp_tensor - def _named_members(self, get_members_fn, prefix='', recurse=True, remove_duplicate=True): + def _named_members(self, get_members_fn, prefix='', recurse=True, **kwargs): r"""Helper method for yielding various names + members of modules.""" - return self._module._named_members(get_members_fn, prefix, recurse) + #compitibity with torch 2.0 + if "remove_duplicate" in inspect.signature(torch.nn.Module._named_members).parameters and "remove_duplicate" not in kwargs: + kwargs['remove_duplicate'] = True + return self._module._named_members(get_members_fn, prefix, recurse, **kwargs) + def named_modules(self, memo = None, prefix: str = '', remove_duplicate: bool = True): r"""Returns an iterator over all modules in the network, yielding both the name of the module as well as the module itself. diff --git a/bmtrain/loss/_function.py b/bmtrain/loss/_function.py new file mode 100644 index 00000000..658ef242 --- /dev/null +++ b/bmtrain/loss/_function.py @@ -0,0 +1,86 @@ + +from .. 
import C +import torch +CHECK_INPUT = lambda x: x.is_contiguous() and x.is_cuda +def has_inf_nan(g_fp16: torch.Tensor, out: torch.Tensor) -> None: + assert g_fp16.dtype == torch.float16, "g_fp16 must be a half tensor" + assert out.dtype == torch.uint8, "out must be a uint8 tensor" + assert CHECK_INPUT(g_fp16), "g_fp16 must be contiguous and on cuda" + assert CHECK_INPUT(out), "out must be contiguous and on cuda" + mid = torch.zeros(1024, device=out.device, dtype=out.dtype) + stream = torch.cuda.current_stream().cuda_stream + C.has_nan_inf_launcher(g_fp16.numel(), g_fp16.data_ptr(), mid.data_ptr(), out.data_ptr(), stream) + + + +def cross_entropy_forward(m: int, n: int, input: torch.Tensor, target: torch.Tensor, + softmax: torch.Tensor, output: torch.Tensor, ignore_index: int) -> None: + CHECK_INPUT(input) + CHECK_INPUT(target) + CHECK_INPUT(softmax) + CHECK_INPUT(output) + assert input.dtype == torch.float16, "input must be a half tensor" + assert target.dtype == torch.int32, "target must be an int tensor" + assert softmax.dtype == torch.float16, "softmax must be a half tensor" + assert output.dtype == torch.float32, "output must be a float tensor" + assert input.numel() == softmax.numel(), "input and softmax must have the same number of elements" + assert target.numel() == output.numel(), "target and output must have the same number of elements" + input_ptr = input.data_ptr() + target_ptr = target.data_ptr() + softmax_ptr = softmax.data_ptr() + output_ptr = output.data_ptr() + cuda_stream = torch.cuda.current_stream().cuda_stream + C.cross_entropy_forward_launcher(m, n, input_ptr, target_ptr, softmax_ptr, output_ptr, ignore_index, cuda_stream) + +def cross_entropy_backward(m: int, n: int, grad_output: torch.Tensor, target: torch.Tensor, + softmax: torch.Tensor, grad_input: torch.Tensor, ignore_index: int) -> None: + CHECK_INPUT(grad_output) + CHECK_INPUT(target) + CHECK_INPUT(softmax) + CHECK_INPUT(grad_input) + assert grad_output.dtype == torch.float32, "grad_output must be a float tensor" + assert target.dtype == torch.int32, "target must be an int tensor" + assert softmax.dtype == torch.float16, "softmax must be a half tensor" + assert grad_input.dtype == torch.float16, "grad_input must be a half tensor" + assert grad_input.numel() == softmax.numel(), "grad_input and softmax must have the same number of elements" + assert target.numel() == grad_output.numel(), "target and grad_output must have the same number of elements" + grad_output_ptr = grad_output.data_ptr() + target_ptr = target.data_ptr() + softmax_ptr = softmax.data_ptr() + grad_input_ptr = grad_input.data_ptr() + cuda_stream = torch.cuda.current_stream().cuda_stream + C.cross_entropy_backward_launcher(m, n, grad_output_ptr, target_ptr, softmax_ptr, grad_input_ptr, ignore_index, cuda_stream) + +def cross_entropy_forward_inplace(m: int, n: int, x: torch.Tensor, target: torch.Tensor, + output: torch.Tensor, ignore_index: int) -> None: + CHECK_INPUT(x) + CHECK_INPUT(target) + CHECK_INPUT(output) + assert x.dtype == torch.float16, "x must be a half tensor" + assert target.dtype == torch.int32, "target must be an int tensor" + assert output.dtype == torch.float32, "output must be a float tensor" + assert target.numel() == output.numel(), "target and output must have the same number of elements" + cuda_stream = torch.cuda.current_stream().cuda_stream + x_ptr = x.data_ptr() + output_ptr = output.data_ptr() + target_ptr = target.data_ptr() + output_ptr = output.data_ptr() + + C.cross_entropy_forward_inplace_launcher(m, n, x_ptr, 
target_ptr, output_ptr, ignore_index, cuda_stream) + +def cross_entropy_backward_inplace(m: int, n: int, grad_output: torch.Tensor, target: torch.Tensor, + x: torch.Tensor, ignore_index: int) -> None: + CHECK_INPUT(grad_output) + CHECK_INPUT(target) + CHECK_INPUT(x) + assert grad_output.dtype == torch.float32, "grad_output must be a float tensor" + assert target.dtype == torch.int32, "target must be an int tensor" + assert x.dtype == torch.float16, "x must be a half tensor" + assert target.numel() == grad_output.numel(), "target and grad_output must have the same number of elements" + cuda_stream = torch.cuda.current_stream().cuda_stream + grad_output_ptr = grad_output.data_ptr() + target_ptr = target.data_ptr() + x_ptr = x.data_ptr() + + C.cross_entropy_backward_inplace_launcher(m, n, grad_output_ptr, target_ptr, x_ptr, ignore_index, cuda_stream) + diff --git a/bmtrain/loss/cross_entropy.py b/bmtrain/loss/cross_entropy.py index 1fb37181..160ef421 100644 --- a/bmtrain/loss/cross_entropy.py +++ b/bmtrain/loss/cross_entropy.py @@ -1,7 +1,6 @@ from typing import Optional import torch -from . import _cuda as C - +from . import _function as F class OpFusedCrossEntropy(torch.autograd.Function): """ CrossEntropy dim = 1 @@ -11,7 +10,7 @@ def forward(ctx, x : torch.Tensor, target : torch.Tensor, ignore_index: int): assert x.ndim == 2 softmax = torch.empty(x.size(), device=x.device, dtype=x.dtype) out = torch.empty(x.size(0), device=x.device, dtype=torch.float) - C.f_cross_entropy_forward( + F.cross_entropy_forward( x.size(0), x.size(1), x, target, softmax, out, @@ -25,7 +24,7 @@ def forward(ctx, x : torch.Tensor, target : torch.Tensor, ignore_index: int): def backward(ctx, grad_output : torch.Tensor): grad_output = grad_output.contiguous() softmax, target = ctx.saved_tensors - C.f_cross_entropy_backward_inplace( + F.cross_entropy_backward_inplace( softmax.size(0), softmax.size(1), grad_output, target, softmax, @@ -41,7 +40,7 @@ class OpFusedCrossEntropyInplace(torch.autograd.Function): def forward(ctx, x : torch.Tensor, target : torch.Tensor, ignore_index: int): assert x.ndim == 2 out = torch.empty(x.size(0), device=x.device, dtype=torch.float) - C.f_cross_entropy_forward_inplace( + F.cross_entropy_forward_inplace( x.size(0), x.size(1), x, target, out, @@ -55,7 +54,7 @@ def forward(ctx, x : torch.Tensor, target : torch.Tensor, ignore_index: int): def backward(ctx, grad_output : torch.Tensor): grad_output = grad_output.contiguous() softmax, target = ctx.saved_tensors - C.f_cross_entropy_backward_inplace( + F.cross_entropy_backward_inplace( softmax.size(0), softmax.size(1), grad_output, target, softmax, diff --git a/bmtrain/nccl/__init__.py b/bmtrain/nccl/__init__.py index b6b2de91..0f4129d5 100644 --- a/bmtrain/nccl/__init__.py +++ b/bmtrain/nccl/__init__.py @@ -1,7 +1,7 @@ from typing_extensions import Literal import torch -from . import _C as C +from .. import C from .enums import * class NCCLCommunicator: diff --git a/bmtrain/optim/_function.py b/bmtrain/optim/_function.py new file mode 100644 index 00000000..342998fa --- /dev/null +++ b/bmtrain/optim/_function.py @@ -0,0 +1,80 @@ + +from .. 
import C +import torch +CHECK_INPUT = lambda x: x.is_contiguous() and x.is_cuda +def adam_cpu(param_fp32: torch.Tensor, param_fp16: torch.Tensor, g_fp16: torch.Tensor, m_fp32: torch.Tensor, + v_fp32: torch.Tensor, beta1: float, beta2: float, eps: float, lr: float, scale: float, + weight_decay: float, step: int) -> None: + assert param_fp32.is_contiguous(), "param_fp32 must be contiguous" + assert param_fp16.is_contiguous(), "param_fp16 must be contiguous" + assert g_fp16.is_contiguous(), "g_fp16 must be contiguous" + assert m_fp32.is_contiguous(), "m_fp32 must be contiguous" + assert v_fp32.is_contiguous(), "v_fp32 must be contiguous" + assert param_fp32.dtype == torch.float32, "param_fp32 must be float32 tensor" + assert param_fp16.dtype == torch.float16, "param_fp16 must be float16 tensor" + assert g_fp16.dtype == torch.float16, "g_fp16 must be float16 tensor" + assert m_fp32.dtype == torch.float32, "m_fp32 must be float32 tensor" + assert v_fp32.dtype == torch.float32, "v_fp32 must be float32 tensor" + assert param_fp32.device == torch.device("cpu"), "param_fp32 must be a cpu tensor" + assert param_fp16.device == torch.device("cpu"), "param_fp16 must be a cpu tensor" + assert g_fp16.device == torch.device("cpu"), "g_fp16 must be a cpu tensor" + assert m_fp32.device == torch.device("cpu"), "m_fp32 must be a cpu tensor" + assert v_fp32.device == torch.device("cpu"), "v_fp32 must be a cpu tensor" + assert param_fp32.numel() == param_fp16.numel(), "param_fp32 and param_fp16 must have the same number of elements" + assert param_fp32.numel() == g_fp16.numel(), "param_fp32 and g_fp16 must have the same number of elements" + assert param_fp32.numel() == m_fp32.numel(), "param_fp32 and m_fp32 must have the same number of elements" + assert param_fp32.numel() == v_fp32.numel(), "param_fp32 and v_fp32 must have the same number of elements" + bias_correction1 = 1 - beta1 ** step + bias_correction2 = 1 - beta2 ** step + stream = torch.cuda.current_stream().cuda_stream + C.adam_cpu_launcher( + param_fp32.numel(), + param_fp32.data_ptr(), + param_fp16.data_ptr(), + g_fp16.data_ptr(), + m_fp32.data_ptr(), + v_fp32.data_ptr(), + beta1, beta2, + eps, lr, + scale, + weight_decay, + bias_correction1, + bias_correction2, + stream + ) + +def adam(param_fp32: torch.Tensor, param_fp16: torch.Tensor, g_fp16: torch.Tensor, m_fp16: torch.Tensor, + v_fp32: torch.Tensor, beta1: float, beta2: float, eps: float, lr: float, scale: float, + weight_decay: float, step: int) -> None: + assert CHECK_INPUT(param_fp32), "param_fp32 must be contiguous and on cuda" + assert CHECK_INPUT(param_fp16), "param_fp16 must be contiguous and on cuda" + assert CHECK_INPUT(g_fp16), "g_fp16 must be contiguous and on cuda" + assert CHECK_INPUT(m_fp16), "m_fp32 must be contiguous and on cuda" + assert CHECK_INPUT(v_fp32), "v_fp32 must be contiguous and on cuda" + assert param_fp32.dtype == torch.float32, "param_fp32 must be float32 tensor" + assert param_fp16.dtype == torch.float16, "param_fp16 must be float16 tensor" + assert g_fp16.dtype == torch.float16, "g_fp16 must be float16 tensor" + assert m_fp16.dtype == torch.float32, "m_fp32 must be float32 tensor" + assert v_fp32.dtype == torch.float32, "v_fp32 must be float32 tensor" + assert param_fp32.numel() == param_fp16.numel(), "param_fp32 and param_fp16 must have the same number of elements" + assert param_fp32.numel() == g_fp16.numel(), "param_fp32 and g_fp16 must have the same number of elements" + assert param_fp32.numel() == m_fp16.numel(), "param_fp32 and m_fp32 must have the same 
number of elements" + assert param_fp32.numel() == v_fp32.numel(), "param_fp32 and v_fp32 must have the same number of elements" + bias_correction1 = 1 - beta1 ** step + bias_correction2 = 1 - beta2 ** step + stream = torch.cuda.current_stream().cuda_stream + C.adam_launcher( + param_fp32.numel(), + param_fp32.data_ptr(), + param_fp16.data_ptr(), + g_fp16.data_ptr(), + m_fp16.data_ptr(), + v_fp32.data_ptr(), + beta1, beta2, + eps, lr, + scale, + weight_decay, + bias_correction1, + bias_correction2, + stream + ) \ No newline at end of file diff --git a/bmtrain/optim/adam.py b/bmtrain/optim/adam.py index dda46f18..4cdb7914 100644 --- a/bmtrain/optim/adam.py +++ b/bmtrain/optim/adam.py @@ -1,7 +1,8 @@ import torch from ..global_var import config -import torch.optim._functional as F -from . import _cuda as C +from . import _function as F +import torch.optim._functional +from .. import C from .. import nccl import inspect from ..utils import check_torch_version @@ -87,7 +88,7 @@ def step(self, closure=None, scale=1): grad = p.grad if p.dtype == torch.half: - C.f_adam( + F.adam( state["_param_fp32"], # fp32 p, # fp16 grad, # fp16 @@ -102,9 +103,9 @@ def step(self, closure=None, scale=1): ) else: other_kwargs = {} - if 'maximize' in inspect.signature(F.adam).parameters: + if 'maximize' in inspect.signature(torch.optim._functional.adam).parameters: other_kwargs['maximize'] = False - F.adam( + torch.optim._functional.adam( [p], [grad / scale], [state['exp_avg']], diff --git a/bmtrain/optim/adam_offload.py b/bmtrain/optim/adam_offload.py index 7e80b866..ecf416f6 100644 --- a/bmtrain/optim/adam_offload.py +++ b/bmtrain/optim/adam_offload.py @@ -1,9 +1,7 @@ import torch from ..global_var import config -from . import _cpu as C -from . import _cuda as G +from . import _function as F from .. import nccl -import torch.optim._functional as F import inspect from ..utils import check_torch_version from copy import deepcopy @@ -107,7 +105,7 @@ def step(self, closure=None, scale=1): grad = -state["_grad_fp16"] else: grad = state["_grad_fp16"] - C.f_adam_cpu( + F.adam_cpu( state["_param_fp32"].view(-1), state["_param_fp16"].view(-1), grad.view(-1), @@ -128,9 +126,9 @@ def step(self, closure=None, scale=1): else: grad = state["_grad_fp32"] other_kwargs = {} - if 'maximize' in inspect.signature(F.adam).parameters: + if 'maximize' in inspect.signature(torch.optim._functional.adam).parameters: other_kwargs['maximize'] = False - F.adam( + torch.optim._functional.adam( [state["_param_fp32"]], [grad], [state["exp_avg"]], diff --git a/bmtrain/optim/optim_manager.py b/bmtrain/optim/optim_manager.py index bb145748..e0943ca0 100644 --- a/bmtrain/optim/optim_manager.py +++ b/bmtrain/optim/optim_manager.py @@ -1,7 +1,7 @@ from typing import Optional, Union, List, Dict, Tuple import torch -from . import _cuda as G +from .. import C from ..utils import print_rank from ..lr_scheduler.warmup import WarmupLRScheduler from .. 
import nccl @@ -13,7 +13,7 @@ def check_overflow(param_groups): for group in param_groups: for p in group['params']: if p.grad is not None and p.dtype == torch.half: # TODO support other types - G.f_has_inf_nan(p.grad, has_inf_or_nan) + C.f_has_inf_nan(p.grad, has_inf_or_nan) if "comm" in config: nccl.allReduce(has_inf_or_nan.storage(), has_inf_or_nan.storage(), "max", config["comm"]) diff --git a/cmake/FindNCCL.cmake b/cmake/FindNCCL.cmake new file mode 100644 index 00000000..e6991733 --- /dev/null +++ b/cmake/FindNCCL.cmake @@ -0,0 +1,75 @@ +list(APPEND NCCL_ROOT $ENV{NCCL_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR}) +if(DEFINED ENV{NCCL_ROOT_DIR}) + set(NCCL_ROOT_DIR $ENV{NCCL_ROOT_DIR}) + set(NCCL_INCLUDE_DIR "${NCCL_ROOT_DIR}/include" CACHE PATH "Folder contains NVIDIA NCCL headers") + set(NCCL_LIB_DIR "${NCCL_ROOT_DIR}/lib" CACHE PATH "Folder contains NVIDIA NCCL libraries") +else() + set(NCCL_INCLUDE_DIR $ENV{NCCL_INCLUDE_DIR} CACHE PATH "Folder contains NVIDIA NCCL headers") + set(NCCL_LIB_DIR $ENV{NCCL_LIB_DIR} CACHE PATH "Folder contains NVIDIA NCCL libraries") +endif() + +# Compatible layer for CMake <3.12. NCCL_ROOT will be accounted in for searching paths and libraries for CMake >=3.12. +list(APPEND CMAKE_PREFIX_PATH ${NCCL_ROOT}) + +find_path(NCCL_INCLUDE_DIRS + NAMES nccl.h + HINTS ${NCCL_INCLUDE_DIR}) + +if (USE_STATIC_NCCL) + MESSAGE(STATUS "USE_STATIC_NCCL is set. Linking with static NCCL library.") + SET(NCCL_LIBNAME "nccl_static") + if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified + set(CMAKE_FIND_LIBRARY_SUFFIXES ".a.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES}) + endif() +else() + SET(NCCL_LIBNAME "nccl") + if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified + set(CMAKE_FIND_LIBRARY_SUFFIXES ".so.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES}) + endif() +endif() + +find_library(NCCL_LIBRARIES + NAMES ${NCCL_LIBNAME} + HINTS ${NCCL_LIB_DIR}) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIRS NCCL_LIBRARIES) + +if(NCCL_FOUND) # obtaining NCCL version and some sanity checks + set (NCCL_HEADER_FILE "${NCCL_INCLUDE_DIRS}/nccl.h") + message (STATUS "Determining NCCL version from ${NCCL_HEADER_FILE}...") + set (OLD_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES}) + list (APPEND CMAKE_REQUIRED_INCLUDES ${NCCL_INCLUDE_DIRS}) + include(CheckCXXSymbolExists) + check_cxx_symbol_exists(NCCL_VERSION_CODE nccl.h NCCL_VERSION_DEFINED) + + if (NCCL_VERSION_DEFINED) + set(file "${PROJECT_BINARY_DIR}/detect_nccl_version.cc") + file(WRITE ${file} " + #include + #include + int main() + { + std::cout << NCCL_MAJOR << '.' << NCCL_MINOR << '.' << NCCL_PATCH << std::endl; + int x; + ncclGetVersion(&x); + return x == NCCL_VERSION_CODE; + } +") + try_run(NCCL_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file} + RUN_OUTPUT_VARIABLE NCCL_VERSION_FROM_HEADER + CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${NCCL_INCLUDE_DIRS}" + LINK_LIBRARIES ${NCCL_LIBRARIES}) + if (NOT NCCL_VERSION_MATCHED) + message(FATAL_ERROR "Found NCCL header version and library version do not match! 
\ +(include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES}) Please set NCCL_INCLUDE_DIR and NCCL_LIB_DIR manually.") + endif() + message(STATUS "NCCL version: ${NCCL_VERSION_FROM_HEADER}") + else() + # message(STATUS "NCCL version < 2.3.5-5") + endif () + set (CMAKE_REQUIRED_INCLUDES ${OLD_CMAKE_REQUIRED_INCLUDES}) + + message(STATUS "Found NCCL (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES})") + mark_as_advanced(NCCL_ROOT_DIR NCCL_INCLUDE_DIRS NCCL_LIBRARIES) +endif() diff --git a/csrc/adam_cuda.cpp b/csrc/adam_cuda.cpp deleted file mode 100644 index 2c102d94..00000000 --- a/csrc/adam_cuda.cpp +++ /dev/null @@ -1,58 +0,0 @@ -#include -#include - -void adam_launcher(const torch::Tensor ¶m_fp32, const torch::Tensor ¶m_fp16, const torch::Tensor &g_fp16, const torch::Tensor &m_fp16, const torch::Tensor &v_fp32, float beta1, float beta2, float eps, float lr, float scale, float weight_decay, float bias_correction1, float bias_correction2); -void has_nan_inf_launcher(const torch::Tensor &g_fp16, torch::Tensor mid, torch::Tensor out); - -#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor") -#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") -#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) - -void F_adam( - const torch::Tensor ¶m_fp32, - const torch::Tensor ¶m_fp16, - const torch::Tensor &g_fp16, - const torch::Tensor &m_fp16, - const torch::Tensor &v_fp32, - float beta1, float beta2, - float eps, float lr, - float scale, - float weight_decay, - int64_t step -) { - CHECK_INPUT(param_fp32); - CHECK_INPUT(param_fp16); - CHECK_INPUT(g_fp16); - CHECK_INPUT(m_fp16); - CHECK_INPUT(v_fp32); - AT_ASSERTM(param_fp32.dtype() == torch::kFloat, "param_fp32 must be a float tensor"); - AT_ASSERTM(param_fp16.dtype() == torch::kHalf, "param_fp16 must be a half tensor"); - AT_ASSERTM(g_fp16.dtype() == torch::kHalf, "g_fp16 must be a half tensor"); - AT_ASSERTM(m_fp16.dtype() == torch::kHalf, "m_fp16 must be a half tensor"); - AT_ASSERTM(v_fp32.dtype() == torch::kFloat, "v_fp32 must be a float tensor"); - AT_ASSERTM(param_fp32.numel() == param_fp16.numel(), "param_fp32 and param_fp16 must have the same number of elements"); - AT_ASSERTM(param_fp32.numel() == g_fp16.numel(), "param_fp32 and g_fp16 must have the same number of elements"); - AT_ASSERTM(param_fp32.numel() == m_fp16.numel(), "param_fp32 and m_fp16 must have the same number of elements"); - AT_ASSERTM(param_fp32.numel() == v_fp32.numel(), "param_fp32 and v_fp32 must have the same number of elements"); - - float bias_correction1 = 1 - powf(beta1, step); - float bias_correction2 = 1 - powf(beta2, step); - - adam_launcher(param_fp32, param_fp16, g_fp16, m_fp16, v_fp32, beta1, beta2, eps, lr, scale, weight_decay, bias_correction1, bias_correction2); -} - -void F_has_inf_nan(const torch::Tensor &g_fp16, torch::Tensor &out) { - CHECK_INPUT(g_fp16); - CHECK_INPUT(out); - AT_ASSERTM(g_fp16.dtype() == torch::kHalf, "g_fp16 must be a half tensor"); - AT_ASSERTM(out.dtype() == torch::kUInt8, "out must be a uint8 tensor"); - - torch::Tensor mid = out.new_zeros({1024}); - - has_nan_inf_launcher(g_fp16, mid, out); -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("f_adam", &F_adam, "adam function"); - m.def("f_has_inf_nan", &F_has_inf_nan, "has inf or nan"); -} diff --git a/csrc/bind.cpp b/csrc/bind.cpp new file mode 100644 index 00000000..8324ba52 --- /dev/null +++ b/csrc/bind.cpp @@ -0,0 +1,25 @@ +#include "include/bind.hpp" + +PYBIND11_MODULE(C, m) { + 
m.def("has_nan_inf_launcher",&has_nan_inf_launcher,"has nan inf"); + m.def("adam_launcher", &adam_launcher, "adam function cpu"); + m.def("adam_cpu_launcher", &adam_cpu_launcher, "adam function cpu"); + m.def("cross_entropy_forward_launcher", &cross_entropy_forward_launcher, "cross entropy forward"); + m.def("cross_entropy_backward_launcher", &cross_entropy_backward_launcher, "cross entropy backward"); + m.def("cross_entropy_forward_inplace_launcher", &cross_entropy_forward_inplace_launcher, "cross entropy forward inplace"); + m.def("cross_entropy_backward_inplace_launcher", &cross_entropy_backward_inplace_launcher, "cross entropy backward inplace"); + m.def("ncclGetUniqueId", &pyNCCLGetUniqueID, "nccl get unique ID"); + m.def("ncclCommInitRank", &pyNCCLCommInitRank, "nccl init rank"); + m.def("ncclCommDestroy", &pyNCCLCommDestroy, "nccl delete rank"); + m.def("ncclAllGather", &pyNCCLAllGather, "nccl all gather"); + m.def("ncclAllReduce", &pyNCCLAllReduce, "nccl all reduce"); + m.def("ncclBroadcast", &pyNCCLBroadcast, "nccl broadcast"); + m.def("ncclReduce", &pyNCCLReduce, "nccl reduce"); + m.def("ncclReduceScatter", &pyNCCLReduceScatter, "nccl reduce scatter"); + m.def("ncclGroupStart", &pyNCCLGroupStart, "nccl group start"); + m.def("ncclGroupEnd", &pyNCCLGroupEnd, "nccl group end"); + m.def("ncclSend",&pyNCCLSend,"nccl send"); + m.def("ncclRecv",&pyNCCLRecv,"nccl recv"); + m.def("ncclCommCount",&pyNCCLCommCount,"nccl comm count"); + m.def("ncclCommUserRank",&pyNCCLCommUserRank,"nccl comm user rank"); +} diff --git a/csrc/cross_entropy_loss.cpp b/csrc/cross_entropy_loss.cpp deleted file mode 100644 index 40adef36..00000000 --- a/csrc/cross_entropy_loss.cpp +++ /dev/null @@ -1,98 +0,0 @@ -#include -#include - -void cross_entropy_forward_launcher(int32_t m, int32_t n, const torch::Tensor &input, const torch::Tensor &target, torch::Tensor &softmax, torch::Tensor &output, int32_t ignore_index); -void cross_entropy_backward_launcher(int32_t m, int32_t n, const torch::Tensor &grad_output, const torch::Tensor &target, const torch::Tensor &softmax, torch::Tensor &grad_input, int32_t ignore_index); -void cross_entropy_forward_inplace_launcher(int32_t m, int32_t n, torch::Tensor &x, const torch::Tensor &target, torch::Tensor &output, int32_t ignore_index); -void cross_entropy_backward_inplace_launcher(int32_t m, int32_t n, const torch::Tensor &grad_output, const torch::Tensor &target, torch::Tensor &x, int32_t ignore_index); - -#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor") -#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") -#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x); - -void F_cross_entropy_forward( - int32_t m, int32_t n, - const torch::Tensor &input, // (m, n) - const torch::Tensor &target, // (m) - torch::Tensor &softmax, // (m, n) - torch::Tensor &output, // (m) - int32_t ignore_index -) { - CHECK_INPUT(input); - CHECK_INPUT(target); - CHECK_INPUT(softmax); - CHECK_INPUT(output); - AT_ASSERTM(input.dtype() == torch::kHalf, "input must be a half tensor"); - AT_ASSERTM(target.dtype() == torch::kInt, "target must be a int tensor"); - AT_ASSERTM(softmax.dtype() == torch::kHalf, "softmax must be a half tensor"); - AT_ASSERTM(output.dtype() == torch::kFloat, "output must be a float tensor"); - AT_ASSERTM(input.numel() == softmax.numel(), "input and softmax must have the same number of elements"); - AT_ASSERTM(target.numel() == output.numel(), "target and output must have the same number of elements"); - - 
cross_entropy_forward_launcher(m, n, input, target, softmax, output, ignore_index); -} - -void F_cross_entropy_backward( - int32_t m, int32_t n, - const torch::Tensor &grad_output, // (m) - const torch::Tensor &target, // (m) - const torch::Tensor &softmax, // (m, n) - torch::Tensor &grad_input, // (m, n) - int32_t ignore_index -) { - CHECK_INPUT(grad_output); - CHECK_INPUT(target); - CHECK_INPUT(softmax); - CHECK_INPUT(grad_input); - AT_ASSERTM(grad_output.dtype() == torch::kFloat, "grad_output must be a float tensor"); - AT_ASSERTM(target.dtype() == torch::kInt, "target must be a int tensor"); - AT_ASSERTM(softmax.dtype() == torch::kHalf, "softmax must be a half tensor"); - AT_ASSERTM(grad_input.dtype() == torch::kHalf, "grad_input must be a half tensor"); - AT_ASSERTM(grad_input.numel() == softmax.numel(), "grad_input and softmax must have the same number of elements"); - AT_ASSERTM(target.numel() == grad_output.numel(), "target and grad_output must have the same number of elements"); - - cross_entropy_backward_launcher(m, n, grad_output, target, softmax, grad_input, ignore_index); -} - -void F_cross_entropy_forward_inplace( - int32_t m, int32_t n, - torch::Tensor &x, // (m, n) - const torch::Tensor &target, // (m) - torch::Tensor &output, // (m) - int32_t ignore_index -) { - CHECK_INPUT(x); - CHECK_INPUT(target); - CHECK_INPUT(output); - AT_ASSERTM(x.dtype() == torch::kHalf, "x must be a half tensor"); - AT_ASSERTM(target.dtype() == torch::kInt, "target must be a int tensor"); - AT_ASSERTM(output.dtype() == torch::kFloat, "output must be a float tensor"); - AT_ASSERTM(target.numel() == output.numel(), "target and output must have the same number of elements"); - - cross_entropy_forward_inplace_launcher(m, n, x, target, output, ignore_index); -} - -void F_cross_entropy_backward_inplace( - int32_t m, int32_t n, - const torch::Tensor &grad_output, // (m) - const torch::Tensor &target, // (m) - torch::Tensor &x, // (m, n) - int32_t ignore_index -) { - CHECK_INPUT(grad_output); - CHECK_INPUT(target); - CHECK_INPUT(x); - AT_ASSERTM(grad_output.dtype() == torch::kFloat, "grad_output must be a float tensor"); - AT_ASSERTM(target.dtype() == torch::kInt, "target must be a int tensor"); - AT_ASSERTM(x.dtype() == torch::kHalf, "x must be a half tensor"); - AT_ASSERTM(target.numel() == grad_output.numel(), "target and grad_output must have the same number of elements"); - - cross_entropy_backward_inplace_launcher(m, n, grad_output, target, x, ignore_index); -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("f_cross_entropy_forward", &F_cross_entropy_forward, "cross entropy forward"); - m.def("f_cross_entropy_backward", &F_cross_entropy_backward, "cross entropy backward"); - m.def("f_cross_entropy_forward_inplace", &F_cross_entropy_forward_inplace, "cross entropy forward inplace"); - m.def("f_cross_entropy_backward_inplace", &F_cross_entropy_backward_inplace, "cross entropy backward inplace"); -} diff --git a/csrc/cuda/adam.cu b/csrc/cuda/adam_cuda.cu similarity index 62% rename from csrc/cuda/adam.cu rename to csrc/cuda/adam_cuda.cu index c7bff5ca..0ab55934 100644 --- a/csrc/cuda/adam.cu +++ b/csrc/cuda/adam_cuda.cu @@ -1,6 +1,5 @@ #include -#include -#include +#include namespace { // blocks , threads @@ -37,28 +36,28 @@ __global__ void adam_fp32_accum( } void adam_launcher( - const torch::Tensor ¶m_fp32, - const torch::Tensor ¶m_fp16, - const torch::Tensor &g_fp16, - const torch::Tensor &m_fp16, - const torch::Tensor &v_fp32, + int n, + std::uintptr_t param_fp32, + std::uintptr_t 
param_fp16,
+    std::uintptr_t g_fp16,
+    std::uintptr_t m_fp16,
+    std::uintptr_t v_fp32,
     float beta1, float beta2,
     float eps, float lr,
     float scale,
     float weight_decay,
     float bias_correction1,
-    float bias_correction2
+    float bias_correction2,
+    uintptr_t stream
 ) {
-    int32_t n = param_fp32.numel();
     if (n <= 0) return;
-    auto g_ptr = reinterpret_cast<half *>(g_fp16.data_ptr());
-    auto m_ptr = reinterpret_cast<half *>(m_fp16.data_ptr());
-    auto v_ptr = v_fp32.data_ptr();
-    auto param_ptr = param_fp32.data_ptr();
-    auto param_h_ptr = reinterpret_cast<half *>(param_fp16.data_ptr());
+    auto g_ptr = reinterpret_cast<half *>(g_fp16);
+    auto m_ptr = reinterpret_cast<half *>(m_fp16);
+    auto param_h_ptr = reinterpret_cast<half *>(param_fp16);
+    auto param_fp32_ptr = reinterpret_cast<float *>(param_fp32);
+    auto v_fp32_ptr = reinterpret_cast<float *>(v_fp32);
     int32_t threads = 1024;
     dim3 block_size = dim3(threads, 1, 1);
     dim3 grid_size = dim3((n + threads - 1) / threads, 1, 1);
-    auto stream = at::cuda::getCurrentCUDAStream();
-    adam_fp32_accum<<<grid_size, block_size, 0, stream.stream()>>>(n, g_ptr, m_ptr, v_ptr, param_ptr, param_h_ptr, beta1, beta2, eps, lr, scale, weight_decay, bias_correction1, bias_correction2);
+    adam_fp32_accum<<<grid_size, block_size, 0, reinterpret_cast<cudaStream_t>(stream)>>>(n, g_ptr, m_ptr, v_fp32_ptr, param_fp32_ptr, param_h_ptr, beta1, beta2, eps, lr, scale, weight_decay, bias_correction1, bias_correction2);
 }
\ No newline at end of file
diff --git a/csrc/cuda/cross_entropy.cu b/csrc/cuda/cross_entropy.cu
index 574ebde0..c0b742ac 100644
--- a/csrc/cuda/cross_entropy.cu
+++ b/csrc/cuda/cross_entropy.cu
@@ -1,7 +1,7 @@
 #include 
-#include 
-#include 
 #include "reduce.cuh"
+#include 
+#include 

 namespace {
 // blocks , threads<1024>
@@ -134,64 +134,74 @@ __global__ void cross_entropy_backward_inplace(
 void cross_entropy_forward_launcher(
     int32_t m, int32_t n,
-    const torch::Tensor &input,
-    const torch::Tensor &target,
-    torch::Tensor &softmax,
-    torch::Tensor &output,
-    int32_t ignore_index
+    std::uintptr_t input,
+    std::uintptr_t target,
+    std::uintptr_t softmax,
+    std::uintptr_t output,
+    int32_t ignore_index,
+    std::uintptr_t stream
 ) {
-    auto input_ptr = reinterpret_cast<half *>(input.data_ptr());
-    auto target_ptr = target.data_ptr();
-    auto softmax_ptr = reinterpret_cast<half *>(softmax.data_ptr());
-    auto output_ptr = output.data_ptr();
+    auto input_ptr = reinterpret_cast<half *>(input);
+    auto target_ptr = reinterpret_cast<int32_t *>(target);
+    auto softmax_ptr = reinterpret_cast<half *>(softmax);
+    auto output_ptr = reinterpret_cast<float *>(output);
     int32_t threads = 1024;
-    auto stream = at::cuda::getCurrentCUDAStream();
-    cross_entropy_forward<<<m, threads, 0, stream.stream()>>>(n, input_ptr, target_ptr, softmax_ptr, output_ptr, ignore_index);
+    cross_entropy_forward<<<m, threads, 0, reinterpret_cast<cudaStream_t>(stream)>>>(n, input_ptr, target_ptr, softmax_ptr, output_ptr, ignore_index);
 }

 void cross_entropy_backward_launcher(
     int32_t m, int32_t n,
-    const torch::Tensor &grad_output,
-    const torch::Tensor &target,
-    const torch::Tensor &softmax,
-    torch::Tensor &grad_input,
-    int32_t ignore_index
+    std::uintptr_t grad_output,
+    std::uintptr_t target,
+    std::uintptr_t softmax,
+    std::uintptr_t grad_input,
+    int32_t ignore_index,
+    std::uintptr_t stream
 ) {
-    auto output_ptr = grad_output.data_ptr();
-    auto target_ptr = target.data_ptr();
-    auto softmax_ptr = reinterpret_cast<half *>(softmax.data_ptr());
-    auto input_ptr = reinterpret_cast<half *>(grad_input.data_ptr());
+    // auto output_ptr = grad_output.data_ptr();
+    auto output_ptr = reinterpret_cast<float *>(grad_output);
+    // auto target_ptr = target.data_ptr();
+    auto target_ptr = reinterpret_cast<int32_t *>(target);
+    auto softmax_ptr = reinterpret_cast<half *>(softmax);
+    auto input_ptr = reinterpret_cast<half *>(grad_input);
     int32_t threads = 1024;
- auto stream = at::cuda::getCurrentCUDAStream(); - cross_entropy_backward<<>>(n, output_ptr, target_ptr, softmax_ptr, input_ptr, ignore_index); + cross_entropy_backward<<(stream)>>>(n, output_ptr, target_ptr, softmax_ptr, input_ptr, ignore_index); } void cross_entropy_forward_inplace_launcher( int32_t m, int32_t n, - torch::Tensor &x, - const torch::Tensor &target, - torch::Tensor &output, - int32_t ignore_index + std::uintptr_t x, + std::uintptr_t target, + std::uintptr_t output, + int32_t ignore_index, + std::uintptr_t stream ) { - auto x_ptr = reinterpret_cast(x.data_ptr()); - auto target_ptr = target.data_ptr(); - auto output_ptr = output.data_ptr(); + // auto x_ptr = reinterpret_cast(x.data_ptr()); + auto x_ptr = reinterpret_cast(x); + // auto target_ptr = target.data_ptr(); + auto target_ptr = reinterpret_cast(target); + // auto output_ptr = output.data_ptr(); + auto output_ptr = reinterpret_cast(output); int32_t threads = 1024; - auto stream = at::cuda::getCurrentCUDAStream(); - cross_entropy_forward_inplace<<>>(n, x_ptr, target_ptr, output_ptr, ignore_index); + // auto stream = at::cuda::getCurrentCUDAStream(); + cross_entropy_forward_inplace<<(stream)>>>(n, x_ptr, target_ptr, output_ptr, ignore_index); } void cross_entropy_backward_inplace_launcher( int32_t m, int32_t n, - const torch::Tensor &grad_output, - const torch::Tensor &target, - torch::Tensor &x, - int32_t ignore_index + std::uintptr_t grad_output, + std::uintptr_t target, + std::uintptr_t x, + int32_t ignore_index, + std::uintptr_t stream ) { - auto output_ptr = grad_output.data_ptr(); - auto target_ptr = target.data_ptr(); - auto x_ptr = reinterpret_cast(x.data_ptr()); + // auto output_ptr = grad_output.data_ptr(); + auto output_ptr = reinterpret_cast(grad_output); + // auto target_ptr = target.data_ptr(); + auto target_ptr = reinterpret_cast(target); + // auto x_ptr = reinterpret_cast(x.data_ptr()); + auto x_ptr = reinterpret_cast(x); int32_t threads = 1024; - auto stream = at::cuda::getCurrentCUDAStream(); - cross_entropy_backward_inplace<<>>(n, output_ptr, target_ptr, x_ptr, ignore_index); + // auto stream = at::cuda::getCurrentCUDAStream(); + cross_entropy_backward_inplace<<(stream)>>>(n, output_ptr, target_ptr, x_ptr, ignore_index); } \ No newline at end of file diff --git a/csrc/cuda/has_inf_nan.cu b/csrc/cuda/has_inf_nan.cu index c8bcca1a..b0e906ff 100644 --- a/csrc/cuda/has_inf_nan.cu +++ b/csrc/cuda/has_inf_nan.cu @@ -1,6 +1,5 @@ #include -#include -#include +#include namespace{ __inline__ __device__ bool isnan_(half v) { @@ -68,21 +67,21 @@ __global__ void bmt_has_nan_inf_2( } void has_nan_inf_launcher( - const torch::Tensor &g_fp16, - torch::Tensor mid, - torch::Tensor out + int32_t n, + std::uintptr_t g_fp16, + std::uintptr_t mid, + std::uintptr_t out, + std::uintptr_t stream ) { - int n = g_fp16.numel(); if (n <= 0) return; - auto g_ptr = reinterpret_cast(g_fp16.data_ptr()); - auto mid_ptr = mid.data_ptr(); - auto stream = at::cuda::getCurrentCUDAStream(); - + auto g_ptr = reinterpret_cast(g_fp16); + auto mid_ptr = reinterpret_cast(mid); + auto out_ptr = reinterpret_cast(out); int32_t threads = 1024; dim3 block_size = dim3(threads, 1, 1); dim3 grid_size = dim3((n + threads - 1) / threads, 1, 1); dim3 clamp_grid_size = dim3(min((n + threads - 1) / threads, 1024), 1, 1); - bmt_has_nan_inf_1<<>>(n, g_ptr, mid_ptr); - bmt_has_nan_inf_2<<<1, block_size, 0, stream.stream()>>>(mid_ptr, out.data_ptr()); + bmt_has_nan_inf_1<<(stream)>>>(n, g_ptr, mid_ptr); + bmt_has_nan_inf_2<<<1, block_size, 0, 
reinterpret_cast(stream)>>>(mid_ptr, out_ptr); } \ No newline at end of file diff --git a/csrc/cuda/reduce.cuh b/csrc/cuda/reduce.cuh index 8179443f..095e8593 100644 --- a/csrc/cuda/reduce.cuh +++ b/csrc/cuda/reduce.cuh @@ -1,6 +1,4 @@ #include -#include -#include namespace { const int WARP_SZ = 32; diff --git a/csrc/adam_cpu.cpp b/csrc/include/adam_cpu.hpp similarity index 57% rename from csrc/adam_cpu.cpp rename to csrc/include/adam_cpu.hpp index b8685b60..1c263b65 100644 --- a/csrc/adam_cpu.cpp +++ b/csrc/include/adam_cpu.hpp @@ -1,8 +1,11 @@ -#include -#include -#include #include +#include +#include +#include +#include + +#include #define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") #if defined(__AVX512F__) @@ -14,14 +17,118 @@ #pragma message "Using AVX256" #define __AVX256__ 1 +#endif +#include +#include +#include + +inline float fp32_from_bits(uint32_t w) { + union { + uint32_t as_bits; + float as_value; + } fp32 = {w}; + return fp32.as_value; +} + +inline uint32_t fp32_to_bits(float f) { + union { + float as_value; + uint32_t as_bits; + } fp32 = {f}; + return fp32.as_bits; +} + +template +inline void parallel_for(int64_t begin, int64_t end, int64_t grain_size, const F& f) { + // Number of iterations + int64_t numiter = end - begin; + + // Number of threads to use + int64_t num_threads = std::max(numiter / grain_size, static_cast(1)); + + // Check if parallel execution is feasible + if (num_threads > 1) { + std::vector threads(num_threads); + for (int64_t t = 0; t < num_threads; ++t) { + threads[t] = std::thread([&, t]() { + int64_t start = begin + t * grain_size; + int64_t end = std::min(begin + (t + 1) * grain_size, end); + f(start, end); + }); + } + // Join all threads + for (auto& thread : threads) { + thread.join(); + } + } else { + // If not feasible, perform the operation serially + f(begin, end); + } +} + + +inline uint16_t fp16_ieee_from_fp32_value(float f) { + // const float scale_to_inf = 0x1.0p+112f; + // const float scale_to_zero = 0x1.0p-110f; + uint32_t scale_to_inf_bits = (uint32_t) 239 << 23; + uint32_t scale_to_zero_bits = (uint32_t) 17 << 23; + float scale_to_inf_val, scale_to_zero_val; + std::memcpy(&scale_to_inf_val, &scale_to_inf_bits, sizeof(scale_to_inf_val)); + std::memcpy(&scale_to_zero_val, &scale_to_zero_bits, sizeof(scale_to_zero_val)); + const float scale_to_inf = scale_to_inf_val; + const float scale_to_zero = scale_to_zero_val; +#if defined(_MSC_VER) && _MSC_VER == 1916 + float base = ((signbit(f) != 0 ? -f : f) * scale_to_inf) * scale_to_zero; +#else + float base = (fabsf(f) * scale_to_inf) * scale_to_zero; #endif + const uint32_t w = (uint32_t)fp32_to_bits(f); + const uint32_t shl1_w = w + w; + const uint32_t sign = w & UINT32_C(0x80000000); + uint32_t bias = shl1_w & UINT32_C(0xFF000000); + if (bias < UINT32_C(0x71000000)) { + bias = UINT32_C(0x71000000); + } + + base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; + const uint32_t bits = (uint32_t)fp32_to_bits(base); + const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); + const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); + const uint32_t nonsign = exp_bits + mantissa_bits; + return static_cast( + (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? 
UINT16_C(0x7E00) : nonsign) + ); + } + +inline float fp16_ieee_to_fp32_value(uint16_t h) { + + const uint32_t w = (uint32_t)h << 16; + const uint32_t sign = w & UINT32_C(0x80000000); + const uint32_t two_w = w + w; + + const uint32_t exp_offset = UINT32_C(0xE0) << 23; + const float exp_scale = 0x1.0p-112f; + const float normalized_value = + fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; + + const uint32_t magic_mask = UINT32_C(126) << 23; + const float magic_bias = 0.5f; + const float denormalized_value = + fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; + + const uint32_t denormalized_cutoff = UINT32_C(1) << 27; + const uint32_t result = + sign | (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) + : fp32_to_bits(normalized_value)); + return fp32_from_bits(result); +} void adam_cpu_launcher( int n, float* param_fp32, - at::Half* param_fp16, - at::Half* g_fp16, + u_int16_t* param_fp16, + u_int16_t* g_fp16, float* m_fp32, float* v_fp32, float beta1, float beta2, @@ -59,7 +166,7 @@ void adam_cpu_launcher( int64_t span = 1; #endif - at::parallel_for(0, n, 0, [&](int64_t start, int64_t end) { + parallel_for(0, n, 0, [&](int64_t start, int64_t end) { for (int64_t j = start; j < end; j += span) { #if defined(__AVX256__) or defined(__AVX512__) if (j + span > end) { @@ -68,7 +175,7 @@ void adam_cpu_launcher( #endif // No AVX or n is not alinged for (int64_t i = j; i < end; i++) { - float g = c10::detail::fp16_ieee_to_fp32_value(g_fp16[i].x) / scale; + float g = fp16_ieee_to_fp32_value(g_fp16[i]) / scale; float m = m_fp32[i]; float v = v_fp32[i]; float p = param_fp32[i]; @@ -76,7 +183,7 @@ void adam_cpu_launcher( v = beta2 * v + (1 - beta2) * g * g; p = p - lr * m / bias_correction1 / (sqrtf(v / bias_correction2) + eps) - lr * weight_decay * p; param_fp32[i] = p; - param_fp16[i] = at::Half(p); + param_fp16[i] = fp16_ieee_from_fp32_value(p); m_fp32[i] = m; v_fp32[i] = v; } @@ -133,57 +240,4 @@ void adam_cpu_launcher( }); } -void F_adam_cpu( - const torch::Tensor ¶m_fp32, - const torch::Tensor ¶m_fp16, - const torch::Tensor &g_fp16, - const torch::Tensor &m_fp32, - const torch::Tensor &v_fp32, - float beta1, float beta2, - float eps, float lr, - float scale, - float weight_decay, - int64_t step -) { - CHECK_CONTIGUOUS(param_fp32); - CHECK_CONTIGUOUS(param_fp16); - CHECK_CONTIGUOUS(g_fp16); - CHECK_CONTIGUOUS(m_fp32); - CHECK_CONTIGUOUS(v_fp32); - AT_ASSERTM(param_fp32.dtype() == torch::kFloat, "param_fp32 must be a float tensor"); - AT_ASSERTM(param_fp16.dtype() == torch::kHalf, "param_fp16 must be a half tensor"); - AT_ASSERTM(g_fp16.dtype() == torch::kHalf, "g_fp16 must be a half tensor"); - AT_ASSERTM(m_fp32.dtype() == torch::kFloat, "m_fp32 must be a float tensor"); - AT_ASSERTM(v_fp32.dtype() == torch::kFloat, "v_fp32 must be a float tensor"); - AT_ASSERTM(param_fp32.is_cpu(), "param_fp32 must be a cpu tensor"); - AT_ASSERTM(param_fp16.is_cpu(), "param_fp16 must be a cpu tensor"); - AT_ASSERTM(g_fp16.is_cpu(), "g_fp16 must be a cpu tensor"); - AT_ASSERTM(m_fp32.is_cpu(), "m_fp32 must be a cpu tensor"); - AT_ASSERTM(v_fp32.is_cpu(), "v_fp32 must be a cpu tensor"); - AT_ASSERTM(param_fp32.numel() == param_fp16.numel(), "param_fp32 and param_fp16 must have the same number of elements"); - AT_ASSERTM(param_fp32.numel() == g_fp16.numel(), "param_fp32 and g_fp16 must have the same number of elements"); - AT_ASSERTM(param_fp32.numel() == m_fp32.numel(), "param_fp32 and m_fp32 must have the same number of elements"); - AT_ASSERTM(param_fp32.numel() == v_fp32.numel(), 
"param_fp32 and v_fp32 must have the same number of elements"); - - float bias_correction1 = 1 - powf(beta1, step); - float bias_correction2 = 1 - powf(beta2, step); - - adam_cpu_launcher( - param_fp32.numel(), - param_fp32.data_ptr(), - param_fp16.data_ptr(), - g_fp16.data_ptr(), - m_fp32.data_ptr(), - v_fp32.data_ptr(), - beta1, beta2, - eps, lr, - scale, - weight_decay, - bias_correction1, - bias_correction2 - ); -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("f_adam_cpu", &F_adam_cpu, "adam function cpu"); -} + diff --git a/csrc/include/bind.hpp b/csrc/include/bind.hpp new file mode 100644 index 00000000..0929de91 --- /dev/null +++ b/csrc/include/bind.hpp @@ -0,0 +1,55 @@ +#include +#include "nccl.hpp" +#include "adam_cpu.hpp" + +void has_nan_inf_launcher(int32_t n,std::uintptr_t g_fp16,std::uintptr_t mid,std::uintptr_t out,std::uintptr_t stream); + +void cross_entropy_backward_launcher( + int32_t m, int32_t n, + std::uintptr_t grad_output, + std::uintptr_t target, + std::uintptr_t softmax, + std::uintptr_t grad_input, + int32_t ignore_index, + std::uintptr_t stream +); +void cross_entropy_backward_inplace_launcher( + int32_t m, int32_t n, + std::uintptr_t grad_output, + std::uintptr_t target, + std::uintptr_t x, + int32_t ignore_index, + std::uintptr_t stream +); + void cross_entropy_forward_inplace_launcher( + int32_t m, int32_t n, + std::uintptr_t x, + std::uintptr_t target, + std::uintptr_t output, + int32_t ignore_index, + std::uintptr_t stream +); +void cross_entropy_forward_launcher( + int32_t m, int32_t n, + std::uintptr_t input, + std::uintptr_t target, + std::uintptr_t softmax, + std::uintptr_t output, + int32_t ignore_index, + std::uintptr_t stream +); +void adam_launcher( + int n, + std::uintptr_t param_fp32, + std::uintptr_t param_fp16, + std::uintptr_t g_fp16, + std::uintptr_t m_fp16, + std::uintptr_t v_fp32, + float beta1, float beta2, + float eps, float lr, + float scale, + float weight_decay, + float bias_correction1, + float bias_correction2, + uintptr_t stream +); \ No newline at end of file diff --git a/csrc/nccl.cpp b/csrc/include/nccl.hpp similarity index 83% rename from csrc/nccl.cpp rename to csrc/include/nccl.hpp index c1dbb321..297fa5ab 100644 --- a/csrc/nccl.cpp +++ b/csrc/include/nccl.hpp @@ -3,7 +3,7 @@ #include namespace py = pybind11; -#include "include/nccl.h" +#include "nccl.h" void checkNCCLStatus(ncclResult_t result) { if (result == ncclSuccess) return; @@ -186,19 +186,3 @@ int pyNCCLCommUserRank( checkNCCLStatus(ncclCommUserRank(reinterpret_cast(comm),&rank)); return rank; } -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("ncclGetUniqueId", &pyNCCLGetUniqueID, "nccl get unique ID"); - m.def("ncclCommInitRank", &pyNCCLCommInitRank, "nccl init rank"); - m.def("ncclCommDestroy", &pyNCCLCommDestroy, "nccl delete rank"); - m.def("ncclAllGather", &pyNCCLAllGather, "nccl all gather"); - m.def("ncclAllReduce", &pyNCCLAllReduce, "nccl all reduce"); - m.def("ncclBroadcast", &pyNCCLBroadcast, "nccl broadcast"); - m.def("ncclReduce", &pyNCCLReduce, "nccl reduce"); - m.def("ncclReduceScatter", &pyNCCLReduceScatter, "nccl reduce scatter"); - m.def("ncclGroupStart", &pyNCCLGroupStart, "nccl group start"); - m.def("ncclGroupEnd", &pyNCCLGroupEnd, "nccl group end"); - m.def("ncclSend",&pyNCCLSend,"nccl send"); - m.def("ncclRecv",&pyNCCLRecv,"nccl recv"); - m.def("ncclCommCount",&pyNCCLCommCount,"nccl comm count"); - m.def("ncclCommUserRank",&pyNCCLCommUserRank,"nccl comm user rank"); -} diff --git a/setup.py b/setup.py index b7411f4c..26fcfed3 100644 
--- a/setup.py +++ b/setup.py @@ -1,216 +1,102 @@ -from setuptools import setup, find_packages import torch -from setuptools import Extension -from setuptools.command.build_ext import build_ext -import os import os +from setuptools.command.build_ext import build_ext +from setuptools import setup, find_packages, Extension +from torch.utils.cpp_extension import BuildExtension import setuptools +import warnings +import sys import subprocess -from pkg_resources import packaging -from typing import Dict, List, Optional, Union, Tuple -def CppExtension(name, sources, *args, **kwargs): - include_dirs = kwargs.get('include_dirs', []) - include_dirs += include_paths() - kwargs['include_dirs'] = include_dirs - - library_dirs = kwargs.get('library_dirs', []) - library_dirs += library_paths() - kwargs['library_dirs'] = library_dirs - - libraries = kwargs.get('libraries', []) - kwargs['libraries'] = libraries - - kwargs['language'] = 'c++' - return setuptools.Extension(name, sources, *args, **kwargs) - - -def CUDAExtension(name, sources, *args, **kwargs): - library_dirs = kwargs.get('library_dirs', []) - library_dirs += library_paths(cuda=True) - kwargs['library_dirs'] = library_dirs - - libraries = kwargs.get('libraries', []) - kwargs['libraries'] = libraries - - include_dirs = kwargs.get('include_dirs', []) - - - include_dirs += include_paths(cuda=True) - kwargs['include_dirs'] = include_dirs - - kwargs['language'] = 'c++' - - dlink_libraries = kwargs.get('dlink_libraries', []) - dlink = kwargs.get('dlink', False) or dlink_libraries - if dlink: - extra_compile_args = kwargs.get('extra_compile_args', {}) - - extra_compile_args_dlink = extra_compile_args.get('nvcc_dlink', []) - extra_compile_args_dlink += ['-dlink'] - extra_compile_args_dlink += [f'-L{x}' for x in library_dirs] - extra_compile_args_dlink += [f'-l{x}' for x in dlink_libraries] - - - extra_compile_args['nvcc_dlink'] = extra_compile_args_dlink - - kwargs['extra_compile_args'] = extra_compile_args - - return setuptools.Extension(name, sources, *args, **kwargs) - -def _find_cuda_home() -> Optional[str]: - r'''Finds the CUDA install path.''' - # Guess #1 - cuda_home = os.environ.get('CUDA_HOME') or os.environ.get('CUDA_PATH') - if cuda_home is None: - # Guess #2 +COMMON_NVCC_FLAGS = [ + '-D__CUDA_NO_HALF_OPERATORS__', + '-D__CUDA_NO_HALF_CONVERSIONS__', + '-D__CUDA_NO_HALF2_OPERATORS__', + '--expt-relaxed-constexpr' +] +class CMakeExtension(Extension): + def __init__(self, name, sourcedir=""): + Extension.__init__(self, name, sources=[]) + self.sourcedir = os.path.abspath(sourcedir) + + +def is_ninja_available(): + r''' + Returns ``True`` if the `ninja `_ build system is + available on the system, ``False`` otherwise. + ''' + with open(os.devnull, 'wb') as devnull: try: - which = 'which' - with open(os.devnull, 'w') as devnull: - nvcc = subprocess.check_output([which, 'nvcc'], - stderr=devnull).decode(*SUBPROCESS_DECODE_ARGS).rstrip('\r\n') - cuda_home = os.path.dirname(os.path.dirname(nvcc)) - except Exception: - #Guess #3 - cuda_home = '/usr/local/cuda' - if not os.path.exists(cuda_home): - cuda_home = None - return cuda_home - -CUDA_HOME = _find_cuda_home() -_BMT_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - -def _join_cuda_home(*paths) -> str: - if CUDA_HOME is None: - raise EnvironmentError('CUDA_HOME environment variable is not set. 
' - 'Please set it to your CUDA install root.') - return os.path.join(CUDA_HOME, *paths) - -def CppExtension(name, sources, *args, **kwargs): - library_dirs = kwargs.get('library_dirs', []) - library_dirs += library_paths(cuda=True) - kwargs['library_dirs'] = library_dirs - - libraries = kwargs.get('libraries', []) - libraries.append('cudart') - kwargs['libraries'] = libraries - - include_dirs = kwargs.get('include_dirs', []) - - include_dirs += include_paths(cuda=True) - kwargs['include_dirs'] = include_dirs - - kwargs['language'] = 'c++' - - dlink_libraries = kwargs.get('dlink_libraries', []) - dlink = kwargs.get('dlink', False) or dlink_libraries - if dlink: - extra_compile_args = kwargs.get('extra_compile_args', {}) - - extra_compile_args_dlink = extra_compile_args.get('nvcc_dlink', []) - extra_compile_args_dlink += ['-dlink'] - extra_compile_args_dlink += [f'-L{x}' for x in library_dirs] - extra_compile_args_dlink += [f'-l{x}' for x in dlink_libraries] - - extra_compile_args['nvcc_dlink'] = extra_compile_args_dlink - - kwargs['extra_compile_args'] = extra_compile_args - - return setuptools.Extension(name, sources, *args, **kwargs) - - -def include_paths(cuda: bool = False) -> List[str]: - paths = [ - os.path.join(_BMT_PATH, 'csrc', 'include'), - ] - if cuda: - cuda_home_include = _join_cuda_home('include') - # if we have the Debian/Ubuntu packages for cuda, we get /usr as cuda home. - # but gcc doesn't like having /usr/include passed explicitly - if cuda_home_include != '/usr/include': - paths.append(cuda_home_include) - return paths - - -def library_paths(cuda: bool = False) -> List[str]: - paths = [] - - if cuda: - lib_dir = 'lib64' - if (not os.path.exists(_join_cuda_home(lib_dir)) and - os.path.exists(_join_cuda_home('lib'))): - lib_dir = 'lib' - - paths.append(_join_cuda_home(lib_dir)) - return paths - -def get_avx_flags(): - if os.environ.get("BMT_AVX256", "").lower() in ["1", "true", "on"]: - return ["-mavx", "-mfma", "-mf16c"] - elif os.environ.get("BMT_AVX512", "").lower() in ["1", "true", "on"]: - return ["-mavx512f"] - else: - return ["-march=native"] - -def get_device_cc(): - try: - CC_SET = set() - for i in range(torch.cuda.device_count()): - CC_SET.add(torch.cuda.get_device_capability(i)) - - if len(CC_SET) == 0: - return None - - ret = "" - for it in CC_SET: - if len(ret) > 0: - ret = ret + " " - ret = ret + ("%d.%d" % it) - return ret - except RuntimeError: - return None - -avx_flag = get_avx_flags() -device_cc = get_device_cc() -if device_cc is None: - if not torch.cuda.is_available(): - os.environ["TORCH_CUDA_ARCH_LIST"] = os.environ.get("TORCH_CUDA_ARCH_LIST", "6.0 6.1 7.0 7.5 8.0+PTX") - else: - if torch.version.cuda.startswith("10"): - os.environ["TORCH_CUDA_ARCH_LIST"] = os.environ.get("TORCH_CUDA_ARCH_LIST", "6.0 6.1 7.0 7.5+PTX") + subprocess.check_call('ninja --version'.split(), stdout=devnull) + except OSError: + return False else: - if not torch.version.cuda.startswith("11.0"): - os.environ["TORCH_CUDA_ARCH_LIST"] = os.environ.get("TORCH_CUDA_ARCH_LIST", "6.0 6.1 7.0 7.5 8.0 8.6+PTX") - else: - os.environ["TORCH_CUDA_ARCH_LIST"] = os.environ.get("TORCH_CUDA_ARCH_LIST", "6.0 6.1 7.0 7.5 8.0+PTX") -else: - os.environ["TORCH_CUDA_ARCH_LIST"] = os.environ.get("TORCH_CUDA_ARCH_LIST", device_cc) - -ext_modules = [] - -if os.environ.get("GITHUB_ACTIONS", "false") == "false": - ext_modules = [ - CUDAExtension('bmtrain.nccl._C', [ - 'csrc/nccl.cpp', - ], include_dirs=["csrc/nccl/build/include"], extra_compile_args={}, extra_link_args=["../libnccl_static.a"]), - 
CUDAExtension('bmtrain.optim._cuda', [ - 'csrc/adam_cuda.cpp', - 'csrc/cuda/adam.cu', - 'csrc/cuda/has_inf_nan.cu' - ], extra_compile_args={}), - CppExtension("bmtrain.optim._cpu", [ - "csrc/adam_cpu.cpp", - ], extra_compile_args=[ - '-fopenmp', - *avx_flag - ], extra_link_args=['-lgomp']), - CUDAExtension('bmtrain.loss._cuda', [ - 'csrc/cross_entropy_loss.cpp', - 'csrc/cuda/cross_entropy.cu', - ], extra_compile_args={}), - ] -else: - ext_modules = [] - + return True + + +class CMakeBuild(build_ext): + + def build_extension(self, ext): + extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name))) + + # required for auto-detection & inclusion of auxiliary "native" libs + if not extdir.endswith(os.path.sep): + extdir += os.path.sep + + debug = int(os.environ.get("DEBUG", 0)) if self.debug is None else self.debug + cfg = "Debug" if debug else "Release" + + # CMake lets you override the generator - we need to check this. + # Can be set with Conda-Build, for example. + cmake_generator = os.environ.get("CMAKE_GENERATOR", "") + + # Set Python_EXECUTABLE instead if you use PYBIND11_FINDPYTHON + # EXAMPLE_VERSION_INFO shows you how to pass a value into the C++ code + # from Python. + cmake_args = [ + f"-DCMAKE_CXX_STANDARD=14", + f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}", + f"-DPYTHON_EXECUTABLE={sys.executable}", + f"-DPYTHON_VERSION={sys.version_info.major}.{sys.version_info.minor}", + f"-DCMAKE_BUILD_TYPE={cfg}", # not used on MSVC, but no harm + ] + + build_args = [] + if "CMAKE_ARGS" in os.environ: + cmake_args += [item for item in os.environ["CMAKE_ARGS"].split(" ") if item] + + cmake_args += [f"-DEXAMPLE_VERSION_INFO={self.distribution.get_version()}"] + + + if not cmake_generator or cmake_generator == "Ninja": + try: + import ninja # noqa: F401 + + ninja_executable_path = os.path.join(ninja.BIN_DIR, "ninja") + cmake_args += [ + "-GNinja", + f"-DCMAKE_MAKE_PROGRAM:FILEPATH={ninja_executable_path}", + ] + except ImportError: + pass + + if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ: + # self.parallel is a Python 3 only way to set parallel jobs by hand + # using -j in the build_ext call, not supported by pip or PyPA-build. + if hasattr(self, "parallel") and self.parallel: + # CMake 3.12+ only. 
+ build_args += [f"-j{self.parallel}"] + + build_temp = os.path.join(self.build_temp, ext.name) + if not os.path.exists(build_temp): + os.makedirs(build_temp) + + cmake_args += ["-DPython_ROOT_DIR=" + os.path.dirname(os.path.dirname(sys.executable))] + subprocess.check_call(["cmake", ext.sourcedir] + cmake_args, cwd=build_temp) + subprocess.check_call(["cmake", "--build", "."] + build_args, cwd=build_temp) + +ext_modules = [ + CMakeExtension("bmtrain.C"), +] setup( name='bmtrain', version='0.2.2', @@ -223,6 +109,6 @@ def get_device_cc(): ], ext_modules=ext_modules, cmdclass={ - 'build_ext': build_ext + 'build_ext': CMakeBuild }) From 6f07c3725d8f5237092674bbbfd02f4e67be5eec Mon Sep 17 00:00:00 2001 From: maydomine <1583143678@qq.com> Date: Mon, 12 Jun 2023 17:51:23 +0800 Subject: [PATCH 05/13] fix some legacy bug --- bmtrain/optim/_function.py | 2 +- bmtrain/optim/optim_manager.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/bmtrain/optim/_function.py b/bmtrain/optim/_function.py index 342998fa..5ed9079a 100644 --- a/bmtrain/optim/_function.py +++ b/bmtrain/optim/_function.py @@ -54,7 +54,7 @@ def adam(param_fp32: torch.Tensor, param_fp16: torch.Tensor, g_fp16: torch.Tenso assert param_fp32.dtype == torch.float32, "param_fp32 must be float32 tensor" assert param_fp16.dtype == torch.float16, "param_fp16 must be float16 tensor" assert g_fp16.dtype == torch.float16, "g_fp16 must be float16 tensor" - assert m_fp16.dtype == torch.float32, "m_fp32 must be float32 tensor" + assert m_fp16.dtype == torch.float16, "m_fp16 must be float16 tensor" assert v_fp32.dtype == torch.float32, "v_fp32 must be float32 tensor" assert param_fp32.numel() == param_fp16.numel(), "param_fp32 and param_fp16 must have the same number of elements" assert param_fp32.numel() == g_fp16.numel(), "param_fp32 and g_fp16 must have the same number of elements" diff --git a/bmtrain/optim/optim_manager.py b/bmtrain/optim/optim_manager.py index e0943ca0..d387faa9 100644 --- a/bmtrain/optim/optim_manager.py +++ b/bmtrain/optim/optim_manager.py @@ -1,7 +1,6 @@ from typing import Optional, Union, List, Dict, Tuple import torch - -from .. import C +from ..loss._function import has_inf_nan from ..utils import print_rank from ..lr_scheduler.warmup import WarmupLRScheduler from .. 
import nccl @@ -13,7 +12,7 @@ def check_overflow(param_groups): for group in param_groups: for p in group['params']: if p.grad is not None and p.dtype == torch.half: # TODO support other types - C.f_has_inf_nan(p.grad, has_inf_or_nan) + has_inf_nan(p.grad, has_inf_or_nan) if "comm" in config: nccl.allReduce(has_inf_or_nan.storage(), has_inf_or_nan.storage(), "max", config["comm"]) @@ -81,6 +80,7 @@ def add_optimizer( self.lr_schedulers.append(lr_scheduler) def scale_loss(self, loss : torch.Tensor) -> torch.Tensor: + return loss * (self.loss_scale / config['world_size']) # loss scale def backward(self, loss : torch.Tensor): @@ -128,6 +128,7 @@ def step(self): for optimizer, lr_scheduler in zip(self.optimizers, self.lr_schedulers): if hasattr(optimizer, "_bmtrain_optimizer") and optimizer._bmtrain_optimizer: + print("BMTrain optimizer step") optimizer.step(scale=self.loss_scale) else: if self.loss_scale_enabled: From 50e0f5b9dfc5d4ebcd467008b62b34ea7a793642 Mon Sep 17 00:00:00 2001 From: maydomine <1583143678@qq.com> Date: Tue, 13 Jun 2023 10:53:03 +0800 Subject: [PATCH 06/13] fix the grad overflow caused by torch2.0.0 --- bmtrain/optim/adam.py | 6 +++++- bmtrain/optim/adam_offload.py | 7 ++++++- bmtrain/optim/optim_manager.py | 8 ++++---- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/bmtrain/optim/adam.py b/bmtrain/optim/adam.py index 4cdb7914..b63a4f51 100644 --- a/bmtrain/optim/adam.py +++ b/bmtrain/optim/adam.py @@ -177,4 +177,8 @@ def update_group(group, new_group): return new_group param_groups = [ update_group(g, ng) for g, ng in zip(groups, saved_groups)] - self.__setstate__({'state': state, 'param_groups': param_groups}) \ No newline at end of file + self.__setstate__({'state': state, 'param_groups': param_groups}) + + #TODO zero_grad(set_to_none=True) makes optimizer crashed, maybe the reason of grad accu + def zero_grad(self, set_to_none: bool = False): + super().zero_grad(set_to_none=set_to_none) \ No newline at end of file diff --git a/bmtrain/optim/adam_offload.py b/bmtrain/optim/adam_offload.py index ecf416f6..e33219bf 100644 --- a/bmtrain/optim/adam_offload.py +++ b/bmtrain/optim/adam_offload.py @@ -250,4 +250,9 @@ def cut_states(state): return { 'state': packed_state, 'param_groups': param_groups, - } \ No newline at end of file + } + + #TODO zero_grad(set_to_none=True) makes optimizer crashed, maybe the reason of grad accu + def zero_grad(self, set_to_none: bool = False): + super().zero_grad(set_to_none=set_to_none) + diff --git a/bmtrain/optim/optim_manager.py b/bmtrain/optim/optim_manager.py index d387faa9..8d8161f8 100644 --- a/bmtrain/optim/optim_manager.py +++ b/bmtrain/optim/optim_manager.py @@ -101,7 +101,7 @@ def zero_grad(self): This is a helper function to call optimizer.zero_grad() """ for optimizer in self.optimizers: - optimizer.zero_grad() + optimizer.zero_grad(set_to_none=False) def step(self): """ @@ -122,10 +122,10 @@ def step(self): break if has_overflow: print_rank("Gradient overflow, change scale from %lf to %lf" % (self.loss_scale, self.loss_scale / self.loss_scale_factor)) - self._justify_scale(self.loss_scale / self.loss_scale_factor) - self.zero_grad() + with torch.no_grad(): + self._justify_scale(self.loss_scale / self.loss_scale_factor) + self.zero_grad() return - for optimizer, lr_scheduler in zip(self.optimizers, self.lr_schedulers): if hasattr(optimizer, "_bmtrain_optimizer") and optimizer._bmtrain_optimizer: print("BMTrain optimizer step") From 88a8f90a3d0f1713ef1643952601ec16fc579cee Mon Sep 17 00:00:00 2001 From: maydomine 
<1583143678@qq.com> Date: Tue, 13 Jun 2023 11:21:26 +0800 Subject: [PATCH 07/13] legacy debug statement --- bmtrain/optim/optim_manager.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bmtrain/optim/optim_manager.py b/bmtrain/optim/optim_manager.py index 8d8161f8..23eaa868 100644 --- a/bmtrain/optim/optim_manager.py +++ b/bmtrain/optim/optim_manager.py @@ -128,7 +128,6 @@ def step(self): return for optimizer, lr_scheduler in zip(self.optimizers, self.lr_schedulers): if hasattr(optimizer, "_bmtrain_optimizer") and optimizer._bmtrain_optimizer: - print("BMTrain optimizer step") optimizer.step(scale=self.loss_scale) else: if self.loss_scale_enabled: From 63831f33290737dc32b891a50a60c69ca400bec1 Mon Sep 17 00:00:00 2001 From: MayDomine <1583143678@qq.com> Date: Tue, 13 Jun 2023 16:25:10 +0800 Subject: [PATCH 08/13] fix the adam_offload bug --- CMakeLists.txt | 29 +++++++++++- bmtrain/optim/_function.py | 4 +- csrc/include/adam_cpu.hpp | 94 +++++++++++++++++++++----------------- 3 files changed, 80 insertions(+), 47 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a00f3f6d..ffecf462 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,7 +4,6 @@ enable_language(C) enable_language(CXX) set(CMAKE_CUDA_ARCHITECTURES "61;62;70;72;75;80") enable_language(CUDA) - set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_STANDARD_REQUIRED True) set(CMAKE_CUDA_STANDARD 14) @@ -30,7 +29,34 @@ message (STATUS "CMAKE_INSTALL_RPATH: ${CMAKE_INSTALL_RPATH}") file(GLOB_RECURSE SOURCES "csrc/*.cpp") file(GLOB_RECURSE CUDA_SOURCES "csrc/cuda/*.cu") +set(BMT_AVX256 $ENV{BMT_AVX256}) +set(BMT_AVX512 $ENV{BMT_AVX512}) + +# Convert them to lowercase +string(TOLOWER "${BMT_AVX256}" BMT_AVX256_LOWER) +string(TOLOWER "${BMT_AVX512}" BMT_AVX512_LOWER) + +# Check the flags +if ("${BMT_AVX256_LOWER}" STREQUAL "1" OR "${BMT_AVX256_LOWER}" STREQUAL "true" OR "${BMT_AVX256_LOWER}" STREQUAL "on") + set(AVX_FLAGS "${AVX_FLAGS} -mavx -mfma -mf16c") +elseif ("${BMT_AVX512_LOWER}" STREQUAL "1" OR "${BMT_AVX512_LOWER}" STREQUAL "true" OR "${BMT_AVX512_LOWER}" STREQUAL "on") + set(AVX_FLAGS "${AVX_FLAGS} -mavx512f") +elseif ("${BMT_AVX256_LOWER}" STREQUAL "" OR "${BMT_AVX512_LOWER}" STREQUAL "") + message(WARNING "BMT_AVX256 or BMT_AVX512 is empty or not defined.") +else() + set(AVX_FLAGS "${AVX_FLAGS} -march=native") +endif() + + + +# Now you can use AVX_FLAGS variable in your target_compile_options +# For example, if your target is named 'my_target': + + pybind11_add_module(C ${SOURCES} ${CUDA_SOURCES}) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAGS}") + target_link_libraries(C PRIVATE "-Wl,-Bsymbolic" "-Wl,-Bsymbolic-functions" @@ -47,6 +73,5 @@ target_include_directories(C PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ) -#STATIC ${bmengine_STATIC_LIBRARIES} diff --git a/bmtrain/optim/_function.py b/bmtrain/optim/_function.py index 5ed9079a..ee4b04a7 100644 --- a/bmtrain/optim/_function.py +++ b/bmtrain/optim/_function.py @@ -26,7 +26,6 @@ def adam_cpu(param_fp32: torch.Tensor, param_fp16: torch.Tensor, g_fp16: torch.T assert param_fp32.numel() == v_fp32.numel(), "param_fp32 and v_fp32 must have the same number of elements" bias_correction1 = 1 - beta1 ** step bias_correction2 = 1 - beta2 ** step - stream = torch.cuda.current_stream().cuda_stream C.adam_cpu_launcher( param_fp32.numel(), param_fp32.data_ptr(), @@ -40,7 +39,6 @@ def adam_cpu(param_fp32: torch.Tensor, param_fp16: torch.Tensor, g_fp16: torch.T weight_decay, bias_correction1, bias_correction2, - stream ) def adam(param_fp32: torch.Tensor, param_fp16: 
torch.Tensor, g_fp16: torch.Tensor, m_fp16: torch.Tensor, @@ -77,4 +75,4 @@ def adam(param_fp32: torch.Tensor, param_fp16: torch.Tensor, g_fp16: torch.Tenso bias_correction1, bias_correction2, stream - ) \ No newline at end of file + ) diff --git a/csrc/include/adam_cpu.hpp b/csrc/include/adam_cpu.hpp index 1c263b65..dd283bb6 100644 --- a/csrc/include/adam_cpu.hpp +++ b/csrc/include/adam_cpu.hpp @@ -3,7 +3,6 @@ #include #include -#include #include #define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") @@ -42,18 +41,27 @@ template inline void parallel_for(int64_t begin, int64_t end, int64_t grain_size, const F& f) { // Number of iterations int64_t numiter = end - begin; - + // Number of threads to use - int64_t num_threads = std::max(numiter / grain_size, static_cast(1)); + int64_t num_threads = 1; // Default to serial execution + + if (grain_size > 0) { + num_threads = std::max(numiter / grain_size, static_cast(1)); + } + else{ + int64_t n = std::thread::hardware_concurrency(); + grain_size = std::max(numiter / n, static_cast(1)); + num_threads = std::max(numiter / grain_size, static_cast(1)); + } // Check if parallel execution is feasible if (num_threads > 1) { std::vector threads(num_threads); for (int64_t t = 0; t < num_threads; ++t) { threads[t] = std::thread([&, t]() { - int64_t start = begin + t * grain_size; - int64_t end = std::min(begin + (t + 1) * grain_size, end); - f(start, end); + int64_t left = begin + t * grain_size; + int64_t right = std::min(begin + (t + 1) * grain_size, end); + f(left, right); }); } // Join all threads @@ -61,12 +69,13 @@ inline void parallel_for(int64_t begin, int64_t end, int64_t grain_size, const F thread.join(); } } else { - // If not feasible, perform the operation serially + // If not feasible or grain_size is 0, perform the operation serially f(begin, end); } } + inline uint16_t fp16_ieee_from_fp32_value(float f) { // const float scale_to_inf = 0x1.0p+112f; // const float scale_to_zero = 0x1.0p-110f; @@ -78,11 +87,7 @@ inline uint16_t fp16_ieee_from_fp32_value(float f) { const float scale_to_inf = scale_to_inf_val; const float scale_to_zero = scale_to_zero_val; -#if defined(_MSC_VER) && _MSC_VER == 1916 - float base = ((signbit(f) != 0 ? 
-f : f) * scale_to_inf) * scale_to_zero; -#else - float base = (fabsf(f) * scale_to_inf) * scale_to_zero; -#endif + float base = (fabsf(f) * scale_to_inf) * scale_to_zero; const uint32_t w = (uint32_t)fp32_to_bits(f); const uint32_t shl1_w = w + w; @@ -125,12 +130,12 @@ inline float fp16_ieee_to_fp32_value(uint16_t h) { return fp32_from_bits(result); } void adam_cpu_launcher( - int n, - float* param_fp32, - u_int16_t* param_fp16, - u_int16_t* g_fp16, - float* m_fp32, - float* v_fp32, + int64_t n, + std::uintptr_t param_fp32, + std::uintptr_t param_fp16, + std::uintptr_t g_fp16, + std::uintptr_t m_fp32, + std::uintptr_t v_fp32, float beta1, float beta2, float eps, float lr, float scale, @@ -138,6 +143,11 @@ void adam_cpu_launcher( float bias_correction1, float bias_correction2 ) { + auto param_fp32_ptr = reinterpret_cast(param_fp32); + auto m_fp32_ptr = reinterpret_cast(m_fp32); + auto v_fp32_ptr = reinterpret_cast(v_fp32); + auto p_fp16_ptr = reinterpret_cast(param_fp16); + auto g_fp16_ptr = reinterpret_cast(g_fp16); #if defined(__AVX512__) auto avx_beta1 = _mm512_set1_ps(beta1); auto avx_beta2 = _mm512_set1_ps(beta2); @@ -175,26 +185,26 @@ void adam_cpu_launcher( #endif // No AVX or n is not alinged for (int64_t i = j; i < end; i++) { - float g = fp16_ieee_to_fp32_value(g_fp16[i]) / scale; - float m = m_fp32[i]; - float v = v_fp32[i]; - float p = param_fp32[i]; + float g = fp16_ieee_to_fp32_value(g_fp16_ptr[i]) / scale; + float m = m_fp32_ptr[i]; + float v = v_fp32_ptr[i]; + float p = param_fp32_ptr[i]; m = beta1 * m + (1 - beta1) * g; v = beta2 * v + (1 - beta2) * g * g; p = p - lr * m / bias_correction1 / (sqrtf(v / bias_correction2) + eps) - lr * weight_decay * p; - param_fp32[i] = p; - param_fp16[i] = fp16_ieee_from_fp32_value(p); - m_fp32[i] = m; - v_fp32[i] = v; + param_fp32_ptr[i] = p; + p_fp16_ptr[i] = fp16_ieee_from_fp32_value(p); + m_fp32_ptr[i] = m; + v_fp32_ptr[i] = v; } break; // must break here } else { // use AVX here #if defined(__AVX512__) - auto g = _mm512_div_ps(_mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)&g_fp16[j])), avx_scale); - auto m = _mm512_loadu_ps(&m_fp32[j]); - auto v = _mm512_loadu_ps(&v_fp32[j]); - auto p = _mm512_loadu_ps(¶m_fp32[j]); + auto g = _mm512_div_ps(_mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)&g_fp16_ptr[j])), avx_scale); + auto m = _mm512_loadu_ps(&m_fp32_ptr[j]); + auto v = _mm512_loadu_ps(&v_fp32_ptr[j]); + auto p = _mm512_loadu_ps(¶m_fp32_ptr[j]); m = _mm512_fmadd_ps(avx_beta1, m, _mm512_mul_ps(avx_beta1_1, g)); v = _mm512_fmadd_ps(avx_beta2, v, _mm512_mul_ps(avx_beta2_1, _mm512_mul_ps(g, g))); p = _mm512_fmadd_ps(avx_neg_lr, _mm512_mul_ps(avx_weight_decay, p), p); // p = p - lr * weight_decay * p @@ -209,15 +219,15 @@ void adam_cpu_launcher( ), p ); // p = p - lr * m / bias_correction1 / (sqrtf(v / bias_correction2) + eps) - _mm512_storeu_ps(¶m_fp32[j], p); - _mm256_storeu_si256((__m256i*)¶m_fp16[j], _mm512_cvtps_ph(p, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); - _mm512_storeu_ps(&m_fp32[j], m); - _mm512_storeu_ps(&v_fp32[j], v); + _mm512_storeu_ps(¶m_fp32_ptr[j], p); + _mm256_storeu_si256((__m256i*)&p_fp16_ptr[j], _mm512_cvtps_ph(p, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + _mm512_storeu_ps(&m_fp32_ptr[j], m); + _mm512_storeu_ps(&v_fp32_ptr[j], v); #elif defined(__AVX256__) - auto g = _mm256_div_ps(_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)&g_fp16[j])), avx_scale); - auto m = _mm256_loadu_ps(&m_fp32[j]); - auto v = _mm256_loadu_ps(&v_fp32[j]); - auto p = _mm256_loadu_ps(¶m_fp32[j]); + auto g = 
_mm256_div_ps(_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)&g_fp16_ptr[j])), avx_scale); + auto m = _mm256_loadu_ps(&m_fp32_ptr[j]); + auto v = _mm256_loadu_ps(&v_fp32_ptr[j]); + auto p = _mm256_loadu_ps(¶m_fp32_ptr[j]); m = _mm256_fmadd_ps(avx_beta1, m, _mm256_mul_ps(avx_beta1_1, g)); v = _mm256_fmadd_ps(avx_beta2, v, _mm256_mul_ps(avx_beta2_1, _mm256_mul_ps(g, g))); p = _mm256_fmadd_ps(avx_neg_lr, _mm256_mul_ps(avx_weight_decay, p), p); // p = p - lr * weight_decay * p @@ -229,10 +239,10 @@ void adam_cpu_launcher( ), // m / bias_correction1 / (sqrt(v / bias_correction2) + eps) p ); // p = p - lr * m / bias_correction1 / (sqrt(v / bias_correction2) + eps) - _mm256_storeu_ps(¶m_fp32[j], p); - _mm_storeu_si128((__m128i*)¶m_fp16[j], _mm256_cvtps_ph(p, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); - _mm256_storeu_ps(&m_fp32[j], m); - _mm256_storeu_ps(&v_fp32[j], v); + _mm256_storeu_ps(¶m_fp32_ptr[j], p); + _mm_storeu_si128((__m128i*)&p_fp16_ptr[j], _mm256_cvtps_ph(p, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + _mm256_storeu_ps(&m_fp32_ptr[j], m); + _mm256_storeu_ps(&v_fp32_ptr[j], v); #endif } } From bb5f0aecd85242f81d2c0970c7a3fe6ffb9a7936 Mon Sep 17 00:00:00 2001 From: MayDomine <1583143678@qq.com> Date: Wed, 14 Jun 2023 07:08:24 +0000 Subject: [PATCH 09/13] modified the setup.py and CMakeList.txt to simpify the compiling process and add github action config git action workflow --- .github/workflows/publish.yaml | 83 +++++-- .github/workflows/release.yml | 71 ++++++ CMakeLists.txt | 7 +- Release.txt | 3 + cmake/FindNCCL.cmake | 27 ++- csrc/include/nccl.h | 398 --------------------------------- csrc/include/nccl.hpp | 2 +- csrc/include/nccl_net.h | 313 -------------------------- setup.py | 8 +- 9 files changed, 171 insertions(+), 741 deletions(-) create mode 100644 .github/workflows/release.yml create mode 100644 Release.txt delete mode 100644 csrc/include/nccl.h delete mode 100644 csrc/include/nccl_net.h diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 43d8d52a..4f40bf66 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -1,26 +1,65 @@ -name: "Publish to PyPI" +name: Build and Publish to PyPI + on: - release: - types: - - published + push: + branches: + - main + jobs: - build-n-publish: - name: Build and publish + build: runs-on: ubuntu-latest + strategy: + matrix: + python-version: ['36', '37', '38', '39'] + steps: - - uses: actions/checkout@master - - uses: actions/setup-python@v2 - with: - python-version: '3.7' - architecture: 'x64' - - name: Run build script - run: | - pip install twine --user - pip install wheel - pip install torch==1.11.0+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html - python setup.py sdist --format=gztar - - name: Publish distribution 📦 to PyPI - uses: pypa/gh-action-pypi-publish@master - with: - user: __token__ - password: ${{ secrets.pypi_password }} \ No newline at end of file + - name: Checkout code + uses: actions/checkout@v3 + + - name: Login to DockerHub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Pull Docker image + run: docker pull maydomine/bmtrain-manylinux:cu110 + + - name: Run Docker image and execute script + run: | + version=${{ matrix.python-version }} + docker run -e CUDACXX=/usr/local/cuda/bin/nvcc -e PATH="/workspace/cmake-3.26.4-linux-x86_64/bin:/opt/rh/devtoolset-7/root/usr/bin:$PATH" -e 
LD_LIBRARY_PATH="/opt/rh/devtoolset-7/root/usr/lib64:/opt/rh/devtoolset-7/root/usr/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$LD_LIBRARY_PATH" -v ${{ github.workspace }}:/workspace/BMTrain -i maydomine/bmtrain-manylinux:cu110 /bin/bash -c "cd /workspace/BMTrain;/opt/python/cp${version}*/bin/python setup.py bdist_wheel -d ./wheel/;/opt/python/cp${version}*/bin/python setup.py sdist -d ./sdist/;for file in wheel/*-linux_x86_64.whl; do mv \"\$file\" \"\${file//-linux_x86_64/-manylinux2014_x86_64}\"; done" + + - name: Archive distribution files + uses: actions/upload-artifact@v2 + with: + name: dist + path: | + sdist/*.tar.gz + wheel/*.whl + + publish: + needs: build + runs-on: ubuntu-latest + steps: + - name: Set Up the Python + uses: actions/setup-python@v2 + with: + python-version: 3.9 + + - name: Install twine + run: python -m pip install twine + + - name: Download distribution files + uses: actions/download-artifact@v2 + with: + name: dist + path: dist + + - name: Publish to PyPI + env: + TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + run: | + cd dist + python -m twine upload sdist/*.tar.gz wheel/*.whl diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 00000000..dc3c593e --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,71 @@ +name: Publish release in Github + +on: + push: + tags: + - "v*.*.*" + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ['36', '37', '38', '39'] + + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Login to DockerHub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Pull Docker image + run: docker pull maydomine/bmtrain-manylinux:cu110 + + - name: Run Docker image and execute script + run: | + version=${{ matrix.python-version }} + docker run -e CUDACXX=/usr/local/cuda/bin/nvcc -e PATH="/workspace/cmake-3.26.4-linux-x86_64/bin:/opt/rh/devtoolset-7/root/usr/bin:$PATH" -e LD_LIBRARY_PATH="/opt/rh/devtoolset-7/root/usr/lib64:/opt/rh/devtoolset-7/root/usr/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$LD_LIBRARY_PATH" -v ${{ github.workspace }}:/workspace/BMTrain -i maydomine/bmtrain-manylinux:cu110 /bin/bash -c "cd /workspace/BMTrain;/opt/python/cp${version}*/bin/python setup.py bdist_wheel -d ./wheel/;/opt/python/cp${version}*/bin/python setup.py sdist -d ./sdist/;for file in wheel/*-linux_x86_64.whl; do mv \"\$file\" \"\${file//-linux_x86_64/-manylinux2014_x86_64}\"; done" + + - name: Archive distribution files + uses: actions/upload-artifact@v2 + with: + name: dist + path: | + sdist/*.tar.gz + wheel/*.whl + + publish: + needs: build + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Set Up the Python + uses: actions/setup-python@v2 + with: + python-version: 3.9 + + - name: Download distribution files + uses: actions/download-artifact@v2 + with: + name: dist + path: dist + + - name: Upload Distribution Files + uses: softprops/action-gh-release@v1 + with: + body_path: "Release.txt" + files: | + dist/sdist/*.tar.gz + dist/wheel/*.whl + prerelease: false + token: ${{ secrets.RELEASE_TOKEN }} + release_tag: ${{ steps.create_release.outputs.tag }} + github_token: ${{ secrets.GITHUB_TOKEN }} + env: + GITHUB_REPOSITORY: MayDomine/BMTrain diff --git a/CMakeLists.txt b/CMakeLists.txt index ffecf462..403a4df3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 
@@ set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_STANDARD_REQUIRED True) set(CMAKE_CUDA_STANDARD 14) set(CMAKE_CUDA_STANDARD_REQUIRED True) -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86") +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80") set(CMAKE_BUILD_RPATH $ORIGIN) set(CMAKE_INSTALL_RPATH $ORIGIN) set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/) @@ -20,7 +20,8 @@ execute_process(COMMAND ${Python_EXECUTABLE} "-c" "import pybind11; print(pybind11.get_cmake_dir())" OUTPUT_VARIABLE PYBIND11_CMAKE_DIR OUTPUT_STRIP_TRAILING_WHITESPACE) -message (STATUS "PYBIND11_CMAKE_DIR: ${PYBIND11_CMAKE_DIR}") +message (STATUS "PYBIND11_CMAKE_DIR: ${PYB +IND11_CMAKE_DIR}") list(APPEND CMAKE_PREFIX_PATH ${PYBIND11_CMAKE_DIR}) find_package(pybind11 REQUIRED) @@ -66,7 +67,7 @@ target_include_directories(C PRIVATE ${NCCL_INCLUDE_DIRS}) target_compile_definitions(C PRIVATE VERSION_INFO=${EXAMPLE_VERSION_INFO}) -set_target_properties(C PROPERTIES CUDA_ARCHITECTURES "61;62;70;72;75;80;86") +set_target_properties(C PROPERTIES CUDA_ARCHITECTURES "61;62;70;72;75;80") target_include_directories(C PRIVATE "csrc/include" diff --git a/Release.txt b/Release.txt new file mode 100644 index 00000000..8597044f --- /dev/null +++ b/Release.txt @@ -0,0 +1,3 @@ +# BMTrain New Version Release v0.2.3 +- easier to install (without torch dependency while compiling) +- compatible with torch 2.0 \ No newline at end of file diff --git a/cmake/FindNCCL.cmake b/cmake/FindNCCL.cmake index e6991733..2af8e3b9 100644 --- a/cmake/FindNCCL.cmake +++ b/cmake/FindNCCL.cmake @@ -9,12 +9,31 @@ else() endif() # Compatible layer for CMake <3.12. NCCL_ROOT will be accounted in for searching paths and libraries for CMake >=3.12. -list(APPEND CMAKE_PREFIX_PATH ${NCCL_ROOT}) +if(NOT NCCL_INCLUDE_DIR OR NOT NCCL_LIB_DIR) + execute_process( + COMMAND python -c "import nvidia.nccl;import os; print(os.path.dirname(nvidia.nccl.__file__))" + OUTPUT_VARIABLE NCCL_PIP_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + list(APPEND NCCL_ROOT $ENV{NCCL_PIP_DIR}) + if(NOT NCCL_INCLUDE_DIR) + set(NCCL_INCLUDE_DIR "${NCCL_PIP_DIR}/include") + endif() + if(NOT NCCL_LIB_DIR) + set(NCCL_LIB_DIR "${NCCL_PIP_DIR}/lib") + endif() + find_library(NCCL_LIBRARIES + NAMES ${NCCL_LIBNAME} + HINTS ${NCCL_LIB_DIR}) +endif() +list(APPEND CMAKE_PREFIX_PATH ${NCCL_ROOT}) find_path(NCCL_INCLUDE_DIRS NAMES nccl.h HINTS ${NCCL_INCLUDE_DIR}) + + if (USE_STATIC_NCCL) MESSAGE(STATUS "USE_STATIC_NCCL is set. 
Linking with static NCCL library.") SET(NCCL_LIBNAME "nccl_static") @@ -23,15 +42,21 @@ if (USE_STATIC_NCCL) endif() else() SET(NCCL_LIBNAME "nccl") + if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified + message(STATUS "NCCL version: ${NCCL_VERSION}") set(CMAKE_FIND_LIBRARY_SUFFIXES ".so.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES}) + else() + set(CMAKE_FIND_LIBRARY_SUFFIXES ".so.2" ${CMAKE_FIND_LIBRARY_SUFFIXES}) endif() + endif() find_library(NCCL_LIBRARIES NAMES ${NCCL_LIBNAME} HINTS ${NCCL_LIB_DIR}) + include(FindPackageHandleStandardArgs) find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIRS NCCL_LIBRARIES) diff --git a/csrc/include/nccl.h b/csrc/include/nccl.h deleted file mode 100644 index 90f721c7..00000000 --- a/csrc/include/nccl.h +++ /dev/null @@ -1,398 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_H_ -#define NCCL_H_ - -#include -#include -#if CUDART_VERSION >= 11000 -#include -#endif - -#define NCCL_MAJOR 2 -#define NCCL_MINOR 14 -#define NCCL_PATCH 3 -#define NCCL_SUFFIX "" - -#define NCCL_VERSION_CODE 21403 -#define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z)) - -#ifdef __cplusplus -extern "C" { -#endif - -/* Opaque handle to communicator */ -typedef struct ncclComm* ncclComm_t; - -#define NCCL_UNIQUE_ID_BYTES 128 -typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId; - -/* Error type */ -typedef enum { ncclSuccess = 0, - ncclUnhandledCudaError = 1, - ncclSystemError = 2, - ncclInternalError = 3, - ncclInvalidArgument = 4, - ncclInvalidUsage = 5, - ncclRemoteError = 6, - ncclInProgress = 7, - ncclNumResults = 8 } ncclResult_t; - -/* Communicator configuration. Users can assign value to attributes to specify the - * behavior of a communicator. */ -typedef struct ncclConfig_v21400 { - /* attributes that users should never touch. */ - size_t size; - unsigned int magic; - unsigned int version; - /* attributes that users are able to customize. */ - int blocking; -} ncclConfig_t; - -/* Config initializer must be assigned to initialize config structure when it is created. - * Not initialized config will result in NCCL error. */ -#define NCCL_CONFIG_INITIALIZER { \ - sizeof(ncclConfig_t), /* size */ \ - 0xcafebeef, /* magic */ \ - NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \ - 1 /* blocking */ \ -} - -/* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer. - * This integer is coded with the MAJOR, MINOR and PATCH level of the - * NCCL library - */ -ncclResult_t ncclGetVersion(int *version); -ncclResult_t pncclGetVersion(int *version); - -/* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be - * called once and the Id should be distributed to all ranks in the - * communicator before calling ncclCommInitRank. */ -ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId); -ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId); - -/* Create a new communicator (multi thread/process version) with a configuration - * set by users. 
*/ -ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config); -ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config); - -/* Creates a new communicator (multi thread/process version). - * rank must be between 0 and nranks-1 and unique within a communicator clique. - * Each rank is associated to a CUDA device, which has to be set before calling - * ncclCommInitRank. - * ncclCommInitRank implicitly syncronizes with other ranks, so it must be - * called by different threads/processes or use ncclGroupStart/ncclGroupEnd. */ -ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); -ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); - -/* Creates a clique of communicators (single process version). - * This is a convenience function to create a single-process communicator clique. - * Returns an array of ndev newly initialized communicators in comm. - * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t). - * If devlist is NULL, the first ndev CUDA devices are used. - * Order of devlist defines user-order of processors within the communicator. */ -ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); -ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); - -/* Finalize a communicator. ncclCommFinalize flushes all issued communications, - * and marks communicator state as ncclInProgress. The state will change to ncclSuccess - * when the communicator is globally quiescent and related resources are freed; then, - * calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator - * itself) without blocking. */ -ncclResult_t ncclCommFinalize(ncclComm_t comm); -ncclResult_t pncclCommFinalize(ncclComm_t comm); - -/* Frees local resources associated with communicator object. */ -ncclResult_t ncclCommDestroy(ncclComm_t comm); -ncclResult_t pncclCommDestroy(ncclComm_t comm); - -/* Frees resources associated with communicator object and aborts any operations - * that might still be running on the device. */ -ncclResult_t ncclCommAbort(ncclComm_t comm); -ncclResult_t pncclCommAbort(ncclComm_t comm); - -/* Returns a string for each error code. */ -const char* ncclGetErrorString(ncclResult_t result); -const char* pncclGetErrorString(ncclResult_t result); - -/* Returns a human-readable message of the last error that occurred. - * comm is currently unused and can be set to NULL - */ -const char* ncclGetLastError(ncclComm_t comm); -const char* pncclGetError(ncclComm_t comm); - -/* Checks whether the comm has encountered any asynchronous errors */ -ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); -ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); - -/* Gets the number of ranks in the communicator clique. */ -ncclResult_t ncclCommCount(const ncclComm_t comm, int* count); -ncclResult_t pncclCommCount(const ncclComm_t comm, int* count); - -/* Returns the cuda device number associated with the communicator. */ -ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device); -ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device); - -/* Returns the user-ordered "rank" associated with the communicator. 
*/ -ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank); -ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank); - -/* Reduction operation selector */ -typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t; -typedef enum { ncclSum = 0, - ncclProd = 1, - ncclMax = 2, - ncclMin = 3, - ncclAvg = 4, - /* ncclNumOps: The number of built-in ncclRedOp_t values. Also - * serves as the least possible value for dynamic ncclRedOp_t's - * as constructed by ncclRedOpCreate*** functions. */ - ncclNumOps = 5, - /* ncclMaxRedOp: The largest valid value for ncclRedOp_t. - * It is defined to be the largest signed value (since compilers - * are permitted to use signed enums) that won't grow - * sizeof(ncclRedOp_t) when compared to previous NCCL versions to - * maintain ABI compatibility. */ - ncclMaxRedOp = 0x7fffffff>>(32-8*sizeof(ncclRedOp_dummy_t)) - } ncclRedOp_t; - -/* Data types */ -typedef enum { ncclInt8 = 0, ncclChar = 0, - ncclUint8 = 1, - ncclInt32 = 2, ncclInt = 2, - ncclUint32 = 3, - ncclInt64 = 4, - ncclUint64 = 5, - ncclFloat16 = 6, ncclHalf = 6, - ncclFloat32 = 7, ncclFloat = 7, - ncclFloat64 = 8, ncclDouble = 8, -#if defined(__CUDA_BF16_TYPES_EXIST__) - ncclBfloat16 = 9, - ncclNumTypes = 10 -#else - ncclNumTypes = 9 -#endif -} ncclDataType_t; - -/* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */ -typedef enum { - /* ncclScalarDevice: The scalar is in device-visible memory and will be - * dereferenced while the collective is running. */ - ncclScalarDevice = 0, - - /* ncclScalarHostImmediate: The scalar is in host-visible memory and will be - * dereferenced before the ncclRedOpCreate***() function returns. */ - ncclScalarHostImmediate = 1 -} ncclScalarResidence_t; - -/* - * ncclRedOpCreatePreMulSum - * - * Creates a new reduction operator which pre-multiplies input values by a given - * scalar locally before reducing them with peer values via summation. For use - * only with collectives launched against *comm* and *datatype*. The - * *residence* argument indicates how/when the memory pointed to by *scalar* - * will be dereferenced. Upon return, the newly created operator's handle - * is stored in *op*. - */ -ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm); -ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm); - -/* - * ncclRedOpDestroy - * - * Destroys the reduction operator *op*. The operator must have been created by - * ncclRedOpCreatePreMul with the matching communicator *comm*. An operator may be - * destroyed as soon as the last NCCL function which is given that operator returns. - */ -ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm); -ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm); - -/* - * Collective communication operations - * - * Collective communication operations must be called separately for each - * communicator in a communicator clique. - * - * They return when operations have been enqueued on the CUDA stream. - * - * Since they may perform inter-CPU synchronization, each call has to be done - * from a different thread or process, or need to use Group Semantics (see - * below). - */ - -/* - * Reduce - * - * Reduces data arrays of length count in sendbuff into recvbuff using op - * operation. - * recvbuff may be NULL on all calls except for root device. 
- * root is the rank (not the CUDA device) where data will reside after the - * operation is complete. - * - * In-place operation will happen if sendbuff == recvbuff. - */ -ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, - ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); -ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, - ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); - -/* - * (deprecated) Broadcast (in-place) - * - * Copies count values from root to all other devices. - * root is the rank (not the CUDA device) where data resides before the - * operation is started. - * - * This operation is implicitely in place. - */ -ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, - ncclComm_t comm, cudaStream_t stream); -ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, - ncclComm_t comm, cudaStream_t stream); - -/* - * Broadcast - * - * Copies count values from root to all other devices. - * root is the rank (not the CUDA device) where data resides before the - * operation is started. - * - * In-place operation will happen if sendbuff == recvbuff. - */ -ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, - ncclComm_t comm, cudaStream_t stream); -ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, - ncclComm_t comm, cudaStream_t stream); - -/* - * All-Reduce - * - * Reduces data arrays of length count in sendbuff using op operation, and - * leaves identical copies of result on each recvbuff. - * - * In-place operation will happen if sendbuff == recvbuff. - */ -ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); -ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); - -/* - * Reduce-Scatter - * - * Reduces data in sendbuff using op operation and leaves reduced result - * scattered over the devices so that recvbuff on rank i will contain the i-th - * block of the result. - * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff - * should have a size of at least nranks*recvcount elements. - * - * In-place operations will happen if recvbuff == sendbuff + rank * recvcount. - */ -ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, - size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, - cudaStream_t stream); -ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff, - size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, - cudaStream_t stream); - -/* - * All-Gather - * - * Each device gathers sendcount values from other GPUs into recvbuff, - * receiving data from rank i at offset i*sendcount. - * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff - * should have a size of at least nranks*sendcount elements. - * - * In-place operations will happen if sendbuff == recvbuff + rank * sendcount. 
- */ -ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, - ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); -ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, - ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); - -/* - * Send - * - * Send data from sendbuff to rank peer. - * - * Rank peer needs to call ncclRecv with the same datatype and the same count from this - * rank. - * - * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations - * need to progress concurrently to complete, they must be fused within a ncclGroupStart/ - * ncclGroupEnd section. - */ -ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, - ncclComm_t comm, cudaStream_t stream); -ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, - ncclComm_t comm, cudaStream_t stream); - -/* - * Receive - * - * Receive data from rank peer into recvbuff. - * - * Rank peer needs to call ncclSend with the same datatype and the same count to this - * rank. - * - * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations - * need to progress concurrently to complete, they must be fused within a ncclGroupStart/ - * ncclGroupEnd section. - */ -ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, - ncclComm_t comm, cudaStream_t stream); -ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, - ncclComm_t comm, cudaStream_t stream); - -/* - * Group semantics - * - * When managing multiple GPUs from a single thread, and since NCCL collective - * calls may perform inter-CPU synchronization, we need to "group" calls for - * different ranks/devices into a single call. - * - * Grouping NCCL calls as being part of the same collective operation is done - * using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all - * collective calls until the ncclGroupEnd call, which will wait for all calls - * to be complete. Note that for collective communication, ncclGroupEnd only - * guarantees that the operations are enqueued on the streams, not that - * the operation is effectively done. - * - * Both collective communication and ncclCommInitRank can be used in conjunction - * of ncclGroupStart/ncclGroupEnd, but not together. - * - * Group semantics also allow to fuse multiple operations on the same device - * to improve performance (for aggregated collective calls), or to permit - * concurrent progress of multiple send/receive operations. - */ - -/* - * Group Start - * - * Start a group call. All calls to NCCL until ncclGroupEnd will be fused into - * a single NCCL operation. Nothing will be started on the CUDA stream until - * ncclGroupEnd. - */ -ncclResult_t ncclGroupStart(); -ncclResult_t pncclGroupStart(); - -/* - * Group End - * - * End a group call. Start a fused NCCL operation consisting of all calls since - * ncclGroupStart. Operations on the CUDA stream depending on the NCCL operations - * need to be called after ncclGroupEnd. 
- */ -ncclResult_t ncclGroupEnd(); -ncclResult_t pncclGroupEnd(); - -#ifdef __cplusplus -} // end extern "C" -#endif - -#endif // end include guard diff --git a/csrc/include/nccl.hpp b/csrc/include/nccl.hpp index 297fa5ab..bba0278b 100644 --- a/csrc/include/nccl.hpp +++ b/csrc/include/nccl.hpp @@ -3,7 +3,7 @@ #include namespace py = pybind11; -#include "nccl.h" +#include void checkNCCLStatus(ncclResult_t result) { if (result == ncclSuccess) return; diff --git a/csrc/include/nccl_net.h b/csrc/include/nccl_net.h deleted file mode 100644 index 255a44ee..00000000 --- a/csrc/include/nccl_net.h +++ /dev/null @@ -1,313 +0,0 @@ -/************************************************************************* - * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_NET_H_ -#define NCCL_NET_H_ - -#include "nccl.h" -#include - -#define NCCL_NET_HANDLE_MAXSIZE 128 - -#define NCCL_PTR_HOST 0x1 -#define NCCL_PTR_CUDA 0x2 -#define NCCL_PTR_DMABUF 0x4 - -// Maximum number of requests per comm object -#define NCCL_NET_MAX_REQUESTS 8 - -typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; -typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_ALL=~0} ncclDebugLogSubSys; - -typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); - -typedef struct { - char* name; // Used mostly for logging. - char* pciPath; // Path to the PCI device in /sys. - uint64_t guid; // Unique identifier for the NIC chip. Important for - // cards with multiple PCI functions (Physical or virtual). - int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] - int speed; // Port speed in Mbps. - int port; // Port number. - float latency; // Network latency - int maxComms; // Maximum number of comms we can create - int maxRecvs; // Maximum number of grouped receives. -}ncclNetProperties_v6_t; - -typedef ncclNetProperties_v6_t ncclNetProperties_t; - -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. - // This call must not block for the connection to be established, and instead - // should return successfully with sendComm == NULL with the expectation that - // it will be called again until sendComm != NULL. - ncclResult_t (*connect)(int dev, void* handle, void** sendComm); - // Finalize connection establishment after remote peer has called connect. - // This call must not block for the connection to be established, and instead - // should return successfully with recvComm == NULL with the expectation that - // it will be called again until recvComm != NULL. 
- ncclResult_t (*accept)(void* listenComm, void** recvComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* sizes); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclNet_v6_t; - -typedef ncclNet_v6_t ncclNet_t; - -#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v6 - -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. handles have been created - // using listen() above. rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. - // May return request == NULL if the call cannot be performed (or would block). 
- ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclCollNet_v6_t; - -typedef ncclCollNet_v6_t ncclCollNet_t; - -#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v6 - -// v5 struct for backwards compatibility -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. - // This call must not block for the connection to be established, and instead - // should return successfully with sendComm == NULL with the expectation that - // it will be called again until sendComm != NULL. - ncclResult_t (*connect)(int dev, void* handle, void** sendComm); - // Finalize connection establishment after remote peer has called connect. - // This call must not block for the connection to be established, and instead - // should return successfully with recvComm == NULL with the expectation that - // it will be called again until recvComm != NULL. - ncclResult_t (*accept)(void* listenComm, void** recvComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. 
- ncclResult_t (*test)(void* request, int* done, int* sizes); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclNet_v5_t; - -// v5 struct for backwards compatibility -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. handles have been created - // using listen() above. rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. - // May return request == NULL if the call cannot be performed (or would block). - ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclCollNet_v5_t; - -// v4 struct for backwards compatibility -typedef struct { - char* name; // Used mostly for logging. - char* pciPath; // Path to the PCI device in /sys. - uint64_t guid; // Unique identifier for the NIC chip. Important for - // cards with multiple PCI functions (Physical or virtual). - int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA - int speed; // Port speed in Mbps. - int port; // Port number. - int maxComms; // Maximum number of comms we can create -} ncclNetProperties_v4_t; - -// v4 struct for backwards compatibility -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. 
- ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. - ncclResult_t (*connect)(int dev, void* handle, void** sendComm); - // Finalize connection establishment after remote peer has called connectHandle - ncclResult_t (*accept)(void* listenComm, void** recvComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclNet_v4_t; - -// v4 struct for backwards compatibility -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. handles have been created - // using listen() above. rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. - // May return request == NULL if the call cannot be performed (or would block). 
- ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclCollNet_v4_t; - -#endif // end include guard diff --git a/setup.py b/setup.py index 26fcfed3..16eb535d 100644 --- a/setup.py +++ b/setup.py @@ -1,8 +1,6 @@ -import torch import os from setuptools.command.build_ext import build_ext from setuptools import setup, find_packages, Extension -from torch.utils.cpp_extension import BuildExtension import setuptools import warnings import sys @@ -99,13 +97,17 @@ def build_extension(self, ext): ] setup( name='bmtrain', - version='0.2.2', + version='0.2.3', author="Guoyang Zeng", author_email="qbjooo@qq.com", description="A toolkit for training big models", packages=find_packages(), install_requires=[ "numpy", + "nvidia-nccl-cu11>=2.14.3" + ], + setup_requires=[ + "nvidia-nccl-cu11>=2.14.3" ], ext_modules=ext_modules, cmdclass={ From 1d2a667ef24b3e709fa50adfaf873a7bd3afb746 Mon Sep 17 00:00:00 2001 From: maydomine <1583143678@qq.com> Date: Wed, 5 Jul 2023 10:53:07 +0800 Subject: [PATCH 10/13] fix the thread's bug in adam offload Update publish.yaml to undo pypi release at every push action --- .github/workflows/publish.yaml | 4 ++-- csrc/include/adam_cpu.hpp | 43 +++++++++++++++++++++++++++------- 2 files changed, 36 insertions(+), 11 deletions(-) diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 4f40bf66..8dd8bec7 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -2,8 +2,8 @@ name: Build and Publish to PyPI on: push: - branches: - - main + tags: + - "v*.*.*" jobs: build: diff --git a/csrc/include/adam_cpu.hpp b/csrc/include/adam_cpu.hpp index dd283bb6..93df3207 100644 --- a/csrc/include/adam_cpu.hpp +++ b/csrc/include/adam_cpu.hpp @@ -2,8 +2,9 @@ #include #include #include - - +#include +#include +#include #include #define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") @@ -20,6 +21,7 @@ #include #include #include +using namespace std; inline float fp32_from_bits(uint32_t w) { union { @@ -49,25 +51,48 @@ inline void parallel_for(int64_t begin, int64_t end, int64_t grain_size, const F num_threads = std::max(numiter / grain_size, static_cast(1)); } else{ - int64_t n = std::thread::hardware_concurrency(); - grain_size = std::max(numiter / n, static_cast(1)); - num_threads = std::max(numiter / grain_size, static_cast(1)); + cpu_set_t get; + // int64_t n = sched_getaffinity(0, sizeof(get), &get); + cpu_set_t cpu_set; + CPU_ZERO(&cpu_set); + sched_getaffinity(0, sizeof(cpu_set), &cpu_set); + num_threads = CPU_COUNT(&cpu_set); + grain_size = std::max(numiter / num_threads, static_cast(1)); + // grain_size = std::max(numiter / n, static_cast(1)); + // num_threads = std::max(numiter / grain_size, static_cast(1)); } // Check if parallel execution is feasible if (num_threads > 1) { + py::gil_scoped_release release; // Release the GIL std::vector threads(num_threads); for 
(int64_t t = 0; t < num_threads; ++t) { threads[t] = std::thread([&, t]() { - int64_t left = begin + t * grain_size; - int64_t right = std::min(begin + (t + 1) * grain_size, end); - f(left, right); + cpu_set_t cpu_set; + CPU_ZERO(&cpu_set); + sched_getaffinity(0, sizeof(cpu_set), &cpu_set); + if (CPU_ISSET(t, &cpu_set)) { + int64_t left = begin + t * grain_size; + int64_t right = std::min(begin + (t + 1) * grain_size, end); + f(left, right); + } }); } - // Join all threads for (auto& thread : threads) { thread.join(); } + // std::vector threads(num_threads); + // for (int64_t t = 0; t < num_threads; ++t) { + // threads[t] = std::thread([&, t]() { + // int64_t left = begin + t * grain_size; + // int64_t right = std::min(begin + (t + 1) * grain_size, end); + // f(left, right); + // }); + // } + // // Join all threads + // for (auto& thread : threads) { + // thread.join(); + // } } else { // If not feasible or grain_size is 0, perform the operation serially f(begin, end); From 726d0cb244fe08fe7dc04e85a8badda0f646a49a Mon Sep 17 00:00:00 2001 From: maydomine <1583143678@qq.com> Date: Wed, 5 Jul 2023 11:28:52 +0800 Subject: [PATCH 11/13] inner thread loop bug --- csrc/include/adam_cpu.hpp | 28 ++++------------------------ 1 file changed, 4 insertions(+), 24 deletions(-) diff --git a/csrc/include/adam_cpu.hpp b/csrc/include/adam_cpu.hpp index 93df3207..6f70463e 100644 --- a/csrc/include/adam_cpu.hpp +++ b/csrc/include/adam_cpu.hpp @@ -51,15 +51,12 @@ inline void parallel_for(int64_t begin, int64_t end, int64_t grain_size, const F num_threads = std::max(numiter / grain_size, static_cast(1)); } else{ - cpu_set_t get; - // int64_t n = sched_getaffinity(0, sizeof(get), &get); cpu_set_t cpu_set; CPU_ZERO(&cpu_set); sched_getaffinity(0, sizeof(cpu_set), &cpu_set); num_threads = CPU_COUNT(&cpu_set); grain_size = std::max(numiter / num_threads, static_cast(1)); - // grain_size = std::max(numiter / n, static_cast(1)); - // num_threads = std::max(numiter / grain_size, static_cast(1)); + } // Check if parallel execution is feasible @@ -68,31 +65,14 @@ inline void parallel_for(int64_t begin, int64_t end, int64_t grain_size, const F std::vector threads(num_threads); for (int64_t t = 0; t < num_threads; ++t) { threads[t] = std::thread([&, t]() { - cpu_set_t cpu_set; - CPU_ZERO(&cpu_set); - sched_getaffinity(0, sizeof(cpu_set), &cpu_set); - if (CPU_ISSET(t, &cpu_set)) { - int64_t left = begin + t * grain_size; - int64_t right = std::min(begin + (t + 1) * grain_size, end); - f(left, right); - } + int64_t left = begin + t * grain_size; + int64_t right = std::min(begin + (t + 1) * grain_size, end); + f(left, right); }); } for (auto& thread : threads) { thread.join(); } - // std::vector threads(num_threads); - // for (int64_t t = 0; t < num_threads; ++t) { - // threads[t] = std::thread([&, t]() { - // int64_t left = begin + t * grain_size; - // int64_t right = std::min(begin + (t + 1) * grain_size, end); - // f(left, right); - // }); - // } - // // Join all threads - // for (auto& thread : threads) { - // thread.join(); - // } } else { // If not feasible or grain_size is 0, perform the operation serially f(begin, end); From 97bab329ef246aafdb0804a47a507c51098adbcd Mon Sep 17 00:00:00 2001 From: MayDomine <1583143678@qq.com> Date: Thu, 20 Jul 2023 16:34:18 +0800 Subject: [PATCH 12/13] fix load model oom issue debug --- bmtrain/block_layer.py | 2 ++ bmtrain/layer.py | 2 ++ bmtrain/store.py | 70 +++++++++++++++++++++++++++++------------- 3 files changed, 52 insertions(+), 22 deletions(-) diff --git 
a/bmtrain/block_layer.py b/bmtrain/block_layer.py index 652e085a..8647b7ff 100644 --- a/bmtrain/block_layer.py +++ b/bmtrain/block_layer.py @@ -492,6 +492,8 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, if key in state_dict: # load here input_param = state_dict[key] + if input_param.__class__.__name__ == "DistributedTensorWrapper": + input_param = input_param.broadcast() if input_param.shape != it["shape"]: error_msgs.append('size mismatch for {}: copying a param with shape {} from checkpoint, ' 'the shape in current model is {}.' diff --git a/bmtrain/layer.py b/bmtrain/layer.py index c32018db..ebbef815 100644 --- a/bmtrain/layer.py +++ b/bmtrain/layer.py @@ -82,6 +82,8 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, key = prefix + name if key in state_dict: input_param = state_dict[key] + if input_param.__class__.__name__ == "DistributedTensorWrapper": + input_param = input_param.broadcast() # This is used to avoid copying uninitialized parameters into # non-lazy modules, since they dont have the hook to do the checks # in such case, it will error when accessing the .shape attribute. diff --git a/bmtrain/store.py b/bmtrain/store.py index c0914f9a..80007a93 100644 --- a/bmtrain/store.py +++ b/bmtrain/store.py @@ -136,6 +136,50 @@ def broadcast_object(obj, comm, src = 0): return obj # Must be a Mapping after pytorch 1.12.0 +class DistributedTensorWrapper: + def __init__(self, tensor, shape=None): + self._dtype = tensor.dtype + self._device = tensor.device + self.shape = shape + self.tensor = tensor + + def broadcast(self): + output_param = torch.empty(self.shape, dtype=self._dtype, device="cuda") + if config['rank'] == 0: + input_param = self.tensor + if input_param.is_cuda: + input_param = input_param.clone().contiguous() + else: + input_param = input_param.cuda().contiguous() + + nccl.broadcast( + input_param.storage(), + output_param.storage(), + 0, + config['comm'] + ) + else: + nccl.broadcast( + output_param.storage(), + output_param.storage(), + 0, + config['comm'] + ) + return output_param + def copy(self): + return self.tensor + + def __getattribute__(self, name): + if name == "tensor" or name == "shape": + return object.__getattribute__(self, name) + else: + try: + return object.__getattribute__(self, name) + except AttributeError: + pass + + return getattr(self.tensor, name) + class DistributedStateDictWrapper(Mapping): def __init__(self, state_dict : Dict) -> None: self._state_dict = state_dict @@ -165,30 +209,12 @@ def __getitem__(self, key : str): dtype_idx = tmp_shape[1].item() shape_list = torch.Size(tmp_shape[2: 2 + shape_list_size].tolist()) - output_param = torch.empty(shape_list, dtype=DTYPE_LIST[dtype_idx], device="cuda") - - if config['rank'] == 0: - input_param : torch.Tensor = self._state_dict[key] - if input_param.is_cuda: - input_param = input_param.clone().contiguous() - else: - input_param = input_param.cuda().contiguous() - - nccl.broadcast( - input_param.storage(), - output_param.storage(), - 0, - config['comm'] - ) + if config['rank'] != 0: + return DistributedTensorWrapper(torch.tensor([],dtype=DTYPE_LIST[dtype_idx],device="cuda"),shape=shape_list) else: - nccl.broadcast( - output_param.storage(), - output_param.storage(), - 0, - config['comm'] - ) + return DistributedTensorWrapper(self._state_dict[key],shape=shape_list) + - return output_param def copy(self): return self From 49f277d4a17b5cca36392d6518c860ecb4f36016 Mon Sep 17 00:00:00 2001 From: MayDomine <1583143678@qq.com> Date: Mon, 24 Jul 
2023 11:32:05 +0800 Subject: [PATCH 13/13] choose avx in runtime --- CMakeLists.txt | 30 ++---- bmtrain/store.py | 5 +- csrc/include/adam_cpu.hpp | 220 +++++++++++++++++++++++++------------- csrc/include/cpu_info.h | 38 +++++++ 4 files changed, 195 insertions(+), 98 deletions(-) create mode 100644 csrc/include/cpu_info.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 403a4df3..455f7dbd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,13 @@ set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_STANDARD_REQUIRED True) set(CMAKE_CUDA_STANDARD 14) set(CMAKE_CUDA_STANDARD_REQUIRED True) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80") + +if(NOT DEFINED ENV{BUILD_DOCKER_ENV} OR "$ENV{BUILD_DOCKER_ENV}" STREQUAL "0") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_86,code=sm_86") +endif() + set(CMAKE_BUILD_RPATH $ORIGIN) set(CMAKE_INSTALL_RPATH $ORIGIN) set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/) @@ -30,29 +36,7 @@ message (STATUS "CMAKE_INSTALL_RPATH: ${CMAKE_INSTALL_RPATH}") file(GLOB_RECURSE SOURCES "csrc/*.cpp") file(GLOB_RECURSE CUDA_SOURCES "csrc/cuda/*.cu") -set(BMT_AVX256 $ENV{BMT_AVX256}) -set(BMT_AVX512 $ENV{BMT_AVX512}) - -# Convert them to lowercase -string(TOLOWER "${BMT_AVX256}" BMT_AVX256_LOWER) -string(TOLOWER "${BMT_AVX512}" BMT_AVX512_LOWER) - -# Check the flags -if ("${BMT_AVX256_LOWER}" STREQUAL "1" OR "${BMT_AVX256_LOWER}" STREQUAL "true" OR "${BMT_AVX256_LOWER}" STREQUAL "on") - set(AVX_FLAGS "${AVX_FLAGS} -mavx -mfma -mf16c") -elseif ("${BMT_AVX512_LOWER}" STREQUAL "1" OR "${BMT_AVX512_LOWER}" STREQUAL "true" OR "${BMT_AVX512_LOWER}" STREQUAL "on") - set(AVX_FLAGS "${AVX_FLAGS} -mavx512f") -elseif ("${BMT_AVX256_LOWER}" STREQUAL "" OR "${BMT_AVX512_LOWER}" STREQUAL "") - message(WARNING "BMT_AVX256 or BMT_AVX512 is empty or not defined.") -else() - set(AVX_FLAGS "${AVX_FLAGS} -march=native") -endif() - - - -# Now you can use AVX_FLAGS variable in your target_compile_options -# For example, if your target is named 'my_target': - +set(AVX_FLAGS "${AVX_FLAGS} -march=native") pybind11_add_module(C ${SOURCES} ${CUDA_SOURCES}) diff --git a/bmtrain/store.py b/bmtrain/store.py index 80007a93..0e0fd7ca 100644 --- a/bmtrain/store.py +++ b/bmtrain/store.py @@ -166,6 +166,7 @@ def broadcast(self): config['comm'] ) return output_param + def copy(self): return self.tensor @@ -210,9 +211,9 @@ def __getitem__(self, key : str): shape_list = torch.Size(tmp_shape[2: 2 + shape_list_size].tolist()) if config['rank'] != 0: - return DistributedTensorWrapper(torch.tensor([],dtype=DTYPE_LIST[dtype_idx],device="cuda"),shape=shape_list) + return DistributedTensorWrapper(torch.tensor([], dtype=DTYPE_LIST[dtype_idx], device="cuda"), shape=shape_list) else: - return DistributedTensorWrapper(self._state_dict[key],shape=shape_list) + return DistributedTensorWrapper(self._state_dict[key], shape=shape_list) diff --git a/csrc/include/adam_cpu.hpp b/csrc/include/adam_cpu.hpp index 6f70463e..b070c7a0 100644 --- a/csrc/include/adam_cpu.hpp +++ b/csrc/include/adam_cpu.hpp @@ -6,22 +6,12 @@ #include #include #include -#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") - -#if defined(__AVX512F__) - -#pragma message "Using AVX512" -#define __AVX512__ 1 - -#elif defined(__AVX__) and defined(__FMA__) and defined(__F16C__) - -#pragma message "Using 
AVX256" -#define __AVX256__ 1 -#endif #include #include #include -using namespace std; +#include "cpu_info.h" +#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") + inline float fp32_from_bits(uint32_t w) { union { @@ -134,38 +124,56 @@ inline float fp16_ieee_to_fp32_value(uint16_t h) { : fp32_to_bits(normalized_value)); return fp32_from_bits(result); } -void adam_cpu_launcher( + +void adam_cpu_0( int64_t n, - std::uintptr_t param_fp32, - std::uintptr_t param_fp16, - std::uintptr_t g_fp16, - std::uintptr_t m_fp32, - std::uintptr_t v_fp32, + float* param_fp32_ptr, + uint16_t* param_fp16_ptr, + uint16_t* g_fp16_ptr, + float* m_fp32_ptr, + float* v_fp32_ptr, float beta1, float beta2, float eps, float lr, float scale, float weight_decay, float bias_correction1, float bias_correction2 -) { - auto param_fp32_ptr = reinterpret_cast(param_fp32); - auto m_fp32_ptr = reinterpret_cast(m_fp32); - auto v_fp32_ptr = reinterpret_cast(v_fp32); - auto p_fp16_ptr = reinterpret_cast(param_fp16); - auto g_fp16_ptr = reinterpret_cast(g_fp16); -#if defined(__AVX512__) - auto avx_beta1 = _mm512_set1_ps(beta1); - auto avx_beta2 = _mm512_set1_ps(beta2); - auto avx_beta1_1 = _mm512_set1_ps(1 - beta1); - auto avx_beta2_1 = _mm512_set1_ps(1 - beta2); - auto avx_eps = _mm512_set1_ps(eps); - auto avx_neg_lr = _mm512_set1_ps(-lr); - auto avx_scale = _mm512_set1_ps(scale); - auto avx_weight_decay = _mm512_set1_ps(weight_decay); - auto avx_bias_correction1 = _mm512_set1_ps(bias_correction1); - auto avx_bias_correction2 = _mm512_set1_ps(bias_correction2); - int64_t span = 16; -#elif defined(__AVX256__) +){ + int64_t span = 1; + parallel_for(0, n, 0, [&](int64_t start, int64_t end) { + for (int64_t j = start; j < end; j += span) { + for (int64_t i = j; i < end; i++) { + float g = fp16_ieee_to_fp32_value(g_fp16_ptr[i]) / scale; + float m = m_fp32_ptr[i]; + float v = v_fp32_ptr[i]; + float p = param_fp32_ptr[i]; + m = beta1 * m + (1 - beta1) * g; + v = beta2 * v + (1 - beta2) * g * g; + p = p - lr * m / bias_correction1 / (sqrtf(v / bias_correction2) + eps) - lr * weight_decay * p; + param_fp32_ptr[i] = p; + param_fp16_ptr[i] = fp16_ieee_from_fp32_value(p); + m_fp32_ptr[i] = m; + v_fp32_ptr[i] = v; + } + break; // must break here + } + }); +} + +static void __attribute__ ((__target__ ("avx,fma,f16c"))) adam_cpu_1( + int64_t n, + float* param_fp32_ptr, + uint16_t* param_fp16_ptr, + uint16_t* g_fp16_ptr, + float* m_fp32_ptr, + float* v_fp32_ptr, + float beta1, float beta2, + float eps, float lr, + float scale, + float weight_decay, + float bias_correction1, + float bias_correction2 +){ auto avx_beta1 = _mm256_set1_ps(beta1); auto avx_beta2 = _mm256_set1_ps(beta2); auto avx_beta1_1 = _mm256_set1_ps(1 - beta1); @@ -177,18 +185,9 @@ void adam_cpu_launcher( auto avx_bias_correction1 = _mm256_set1_ps(bias_correction1); auto avx_bias_correction2 = _mm256_set1_ps(bias_correction2); int64_t span = 8; -#else - int64_t span = 1; -#endif - parallel_for(0, n, 0, [&](int64_t start, int64_t end) { for (int64_t j = start; j < end; j += span) { -#if defined(__AVX256__) or defined(__AVX512__) if (j + span > end) { -#else - if (true) { -#endif - // No AVX or n is not alinged for (int64_t i = j; i < end; i++) { float g = fp16_ieee_to_fp32_value(g_fp16_ptr[i]) / scale; float m = m_fp32_ptr[i]; @@ -198,14 +197,78 @@ void adam_cpu_launcher( v = beta2 * v + (1 - beta2) * g * g; p = p - lr * m / bias_correction1 / (sqrtf(v / bias_correction2) + eps) - lr * weight_decay * p; param_fp32_ptr[i] = p; - p_fp16_ptr[i] = 
fp16_ieee_from_fp32_value(p); + param_fp16_ptr[i] = fp16_ieee_from_fp32_value(p); m_fp32_ptr[i] = m; v_fp32_ptr[i] = v; } break; // must break here } else { - // use AVX here -#if defined(__AVX512__) + auto g = _mm256_div_ps(_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)&g_fp16_ptr[j])), avx_scale); + auto m = _mm256_loadu_ps(&m_fp32_ptr[j]); + auto v = _mm256_loadu_ps(&v_fp32_ptr[j]); + auto p = _mm256_loadu_ps(¶m_fp32_ptr[j]); + m = _mm256_fmadd_ps(avx_beta1, m, _mm256_mul_ps(avx_beta1_1, g)); + v = _mm256_fmadd_ps(avx_beta2, v, _mm256_mul_ps(avx_beta2_1, _mm256_mul_ps(g, g))); + p = _mm256_fmadd_ps(avx_neg_lr, _mm256_mul_ps(avx_weight_decay, p), p); // p = p - lr * weight_decay * p + p = _mm256_fmadd_ps( + avx_neg_lr, + _mm256_div_ps( + _mm256_div_ps(m, avx_bias_correction1), // m / bias_correction1 + _mm256_add_ps(_mm256_sqrt_ps(_mm256_div_ps(v, avx_bias_correction2)), avx_eps) // sqrt(v / bias_correction2) + eps + ), // m / bias_correction1 / (sqrt(v / bias_correction2) + eps) + p + ); // p = p - lr * m / bias_correction1 / (sqrt(v / bias_correction2) + eps) + _mm256_storeu_ps(¶m_fp32_ptr[j], p); + _mm_storeu_si128((__m128i*)¶m_fp16_ptr[j], _mm256_cvtps_ph(p, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + _mm256_storeu_ps(&m_fp32_ptr[j], m); + _mm256_storeu_ps(&v_fp32_ptr[j], v); + } + }}); +} + +static void __attribute__ ((__target__ ("avx512f"))) adam_cpu_2( + int64_t n, + float* param_fp32_ptr, + uint16_t* param_fp16_ptr, + uint16_t* g_fp16_ptr, + float* m_fp32_ptr, + float* v_fp32_ptr, + float beta1, float beta2, + float eps, float lr, + float scale, + float weight_decay, + float bias_correction1, + float bias_correction2 +){ + auto avx_beta1 = _mm512_set1_ps(beta1); + auto avx_beta2 = _mm512_set1_ps(beta2); + auto avx_beta1_1 = _mm512_set1_ps(1 - beta1); + auto avx_beta2_1 = _mm512_set1_ps(1 - beta2); + auto avx_eps = _mm512_set1_ps(eps); + auto avx_neg_lr = _mm512_set1_ps(-lr); + auto avx_scale = _mm512_set1_ps(scale); + auto avx_weight_decay = _mm512_set1_ps(weight_decay); + auto avx_bias_correction1 = _mm512_set1_ps(bias_correction1); + auto avx_bias_correction2 = _mm512_set1_ps(bias_correction2); + int64_t span = 16; + parallel_for(0, n, 0, [&](int64_t start, int64_t end) { + for (int64_t j = start; j < end; j += span) { + if (j + span > end) { + for (int64_t i = j; i < end; i++) { + float g = fp16_ieee_to_fp32_value(g_fp16_ptr[i]) / scale; + float m = m_fp32_ptr[i]; + float v = v_fp32_ptr[i]; + float p = param_fp32_ptr[i]; + m = beta1 * m + (1 - beta1) * g; + v = beta2 * v + (1 - beta2) * g * g; + p = p - lr * m / bias_correction1 / (sqrtf(v / bias_correction2) + eps) - lr * weight_decay * p; + param_fp32_ptr[i] = p; + param_fp16_ptr[i] = fp16_ieee_from_fp32_value(p); + m_fp32_ptr[i] = m; + v_fp32_ptr[i] = v; + } + break; // must break here + }else{ auto g = _mm512_div_ps(_mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)&g_fp16_ptr[j])), avx_scale); auto m = _mm512_loadu_ps(&m_fp32_ptr[j]); auto v = _mm512_loadu_ps(&v_fp32_ptr[j]); @@ -225,34 +288,45 @@ void adam_cpu_launcher( p ); // p = p - lr * m / bias_correction1 / (sqrtf(v / bias_correction2) + eps) _mm512_storeu_ps(¶m_fp32_ptr[j], p); - _mm256_storeu_si256((__m256i*)&p_fp16_ptr[j], _mm512_cvtps_ph(p, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + _mm256_storeu_si256((__m256i*)¶m_fp16_ptr[j], _mm512_cvtps_ph(p, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); _mm512_storeu_ps(&m_fp32_ptr[j], m); _mm512_storeu_ps(&v_fp32_ptr[j], v); -#elif defined(__AVX256__) - auto g = 
_mm256_div_ps(_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)&g_fp16_ptr[j])), avx_scale); - auto m = _mm256_loadu_ps(&m_fp32_ptr[j]); - auto v = _mm256_loadu_ps(&v_fp32_ptr[j]); - auto p = _mm256_loadu_ps(¶m_fp32_ptr[j]); - m = _mm256_fmadd_ps(avx_beta1, m, _mm256_mul_ps(avx_beta1_1, g)); - v = _mm256_fmadd_ps(avx_beta2, v, _mm256_mul_ps(avx_beta2_1, _mm256_mul_ps(g, g))); - p = _mm256_fmadd_ps(avx_neg_lr, _mm256_mul_ps(avx_weight_decay, p), p); // p = p - lr * weight_decay * p - p = _mm256_fmadd_ps( - avx_neg_lr, - _mm256_div_ps( - _mm256_div_ps(m, avx_bias_correction1), // m / bias_correction1 - _mm256_add_ps(_mm256_sqrt_ps(_mm256_div_ps(v, avx_bias_correction2)), avx_eps) // sqrt(v / bias_correction2) + eps - ), // m / bias_correction1 / (sqrt(v / bias_correction2) + eps) - p - ); // p = p - lr * m / bias_correction1 / (sqrt(v / bias_correction2) + eps) - _mm256_storeu_ps(¶m_fp32_ptr[j], p); - _mm_storeu_si128((__m128i*)&p_fp16_ptr[j], _mm256_cvtps_ph(p, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); - _mm256_storeu_ps(&m_fp32_ptr[j], m); - _mm256_storeu_ps(&v_fp32_ptr[j], v); -#endif } } + }); +} + - }); + + +void adam_cpu_launcher( + int64_t n, + std::uintptr_t param_fp32, + std::uintptr_t param_fp16, + std::uintptr_t g_fp16, + std::uintptr_t m_fp32, + std::uintptr_t v_fp32, + float beta1, float beta2, + float eps, float lr, + float scale, + float weight_decay, + float bias_correction1, + float bias_correction2 +) { + + auto param_fp32_ptr = reinterpret_cast(param_fp32); + auto m_fp32_ptr = reinterpret_cast(m_fp32); + auto v_fp32_ptr = reinterpret_cast(v_fp32); + auto param_fp16_ptr = reinterpret_cast(param_fp16); + auto g_fp16_ptr = reinterpret_cast(g_fp16); + int cpu_level = get_cpu_level(); + if (cpu_level == 0 ){ + adam_cpu_0(n, param_fp32_ptr, param_fp16_ptr, g_fp16_ptr, m_fp32_ptr, v_fp32_ptr, beta1, beta2, eps, lr, scale, weight_decay, bias_correction1, bias_correction2); + }else if(cpu_level == 1){ + adam_cpu_1(n, param_fp32_ptr, param_fp16_ptr, g_fp16_ptr, m_fp32_ptr, v_fp32_ptr, beta1, beta2, eps, lr, scale, weight_decay, bias_correction1, bias_correction2); + }else{ + adam_cpu_2(n, param_fp32_ptr, param_fp16_ptr, g_fp16_ptr, m_fp32_ptr, v_fp32_ptr, beta1, beta2, eps, lr, scale, weight_decay, bias_correction1, bias_correction2); + } } diff --git a/csrc/include/cpu_info.h b/csrc/include/cpu_info.h new file mode 100644 index 00000000..53ed48f8 --- /dev/null +++ b/csrc/include/cpu_info.h @@ -0,0 +1,38 @@ +#include + +static void cpuid(int info[4], int InfoType){ + __cpuid_count(InfoType, 0, info[0], info[1], info[2], info[3]); +} + +int get_cpu_level() { + // SIMD: 128-bit + bool HW_F16C; + + // SIMD: 256-bit + bool HW_AVX; + bool HW_FMA; + + // SIMD: 512-bit + bool HW_AVX512F; // AVX512 Foundation + + int info[4]; + cpuid(info, 0); + int nIds = info[0]; + + // Detect Features + if (nIds >= 0x00000001){ + cpuid(info,0x00000001); + HW_AVX = (info[2] & ((int)1 << 28)) != 0; + HW_FMA = (info[2] & ((int)1 << 12)) != 0; + HW_F16C = (info[2] & ((int)1 << 29)) != 0; + } + if (nIds >= 0x00000007){ + cpuid(info,0x00000007); + HW_AVX512F = (info[1] & ((int)1 << 16)) != 0; + } + + int ret = 0; + if (HW_AVX && HW_FMA && HW_F16C) ret = 1; + if (HW_AVX512F) ret = 2; + return ret; +}
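
Note on the runtime dispatch introduced in this last patch: get_cpu_level() probes CPUID once and adam_cpu_launcher then calls the scalar, AVX256, or AVX512 kernel accordingly. The selection can be sanity-checked outside of BMTrain with the standalone sketch below (a hypothetical check_cpu_level.cpp, not part of this patch series); it mirrors the probe in csrc/include/cpu_info.h and assumes a GCC/Clang toolchain that provides <cpuid.h> and __get_cpuid_count.

// check_cpu_level.cpp -- illustrative sketch only; mirrors csrc/include/cpu_info.h.
// Level 0 -> adam_cpu_0 (scalar), 1 -> adam_cpu_1 (AVX/FMA/F16C), 2 -> adam_cpu_2 (AVX-512F).
#include <cpuid.h>
#include <cstdio>

static int probe_cpu_level() {
    unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
    bool hw_avx = false, hw_fma = false, hw_f16c = false, hw_avx512f = false;
    // Leaf 1, ECX: bit 28 = AVX, bit 12 = FMA, bit 29 = F16C (same bits cpu_info.h tests).
    if (__get_cpuid_count(1, 0, &eax, &ebx, &ecx, &edx)) {
        hw_avx  = (ecx >> 28) & 1u;
        hw_fma  = (ecx >> 12) & 1u;
        hw_f16c = (ecx >> 29) & 1u;
    }
    // Leaf 7, sub-leaf 0, EBX: bit 16 = AVX-512 Foundation.
    if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
        hw_avx512f = (ebx >> 16) & 1u;
    }
    int level = 0;
    if (hw_avx && hw_fma && hw_f16c) level = 1;
    if (hw_avx512f) level = 2;
    return level;
}

int main() {
    static const char* variants[] = {"adam_cpu_0 (scalar)", "adam_cpu_1 (AVX256)", "adam_cpu_2 (AVX512)"};
    int level = probe_cpu_level();
    std::printf("cpu level %d -> %s\n", level, variants[level]);
    return 0;
}

Compiling this with g++ -O2 check_cpu_level.cpp -o check_cpu_level and running it on the target machine shows which path the optimizer would take; a level of 1 or 2 means the __attribute__((target(...))) AVX kernels above are the ones exercised, while level 0 falls back to the plain scalar loop.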