FX: support pytorch 1.11 #16

Merged
merged 1 commit on Mar 30, 2022
21 changes: 18 additions & 3 deletions bmtrain/block_layer.py
@@ -399,7 +399,12 @@ def __init__(self, inner_module : torch.nn.Module):
to_offset_end = offset_end + param_st - storage_st

# copy to buffer
self._storage_params[kw_name].storage()[to_offset_st: to_offset_end].copy_(contiguous_param.storage()[offset_st: offset_end])
# PyTorch 1.11 changed the API of storage.__getitem__
d_dtype = self._storage_params[kw_name].dtype
d_device = self._storage_params[kw_name].device
torch.tensor([], dtype=d_dtype, device=d_device).set_(self._storage_params[kw_name].storage(), to_offset_st, (to_offset_end - to_offset_st,))[:] = \
torch.tensor([], dtype=d_dtype, device=d_device).set_(contiguous_param.storage(), offset_st, (offset_end - offset_st,))[:]
# self._storage_params[kw_name].storage()[to_offset_st: to_offset_end].copy_(contiguous_param.storage()[offset_st: offset_end])
del contiguous_param

# clear parameter data, but keep the dtype and device
@@ -472,7 +477,12 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
to_offset_end = offset_end + param_st - storage_st

# copy to buffer
self._storage_params[kw_name].storage()[to_offset_st: to_offset_end].copy_(contiguous_param.storage()[offset_st: offset_end])
# PyTorch 1.11 changed the API of storage.__getitem__
d_dtype = self._storage_params[kw_name].dtype
d_device = self._storage_params[kw_name].device
torch.tensor([], dtype=d_dtype, device=d_device).set_(self._storage_params[kw_name].storage(), to_offset_st, (to_offset_end - to_offset_st,))[:] = \
torch.tensor([], dtype=d_dtype, device=d_device).set_(contiguous_param.storage(), offset_st, (offset_end - offset_st,))[:]
# self._storage_params[kw_name].storage()[to_offset_st: to_offset_end].copy_(contiguous_param.storage()[offset_st: offset_end])
del contiguous_param
elif strict:
missing_keys.append(key)
@@ -527,7 +537,12 @@ def init_parameters(self):
to_offset_end = offset_end + param_st - storage_st

# copy to buffer
self._storage_params[kw_name].storage()[to_offset_st: to_offset_end].copy_(tmp_tensor.storage()[offset_st: offset_end])
# PyTorch 1.11 changed the API of storage.__getitem__
d_dtype = self._storage_params[kw_name].dtype
d_device = self._storage_params[kw_name].device
torch.tensor([], dtype=d_dtype, device=d_device).set_(self._storage_params[kw_name].storage(), to_offset_st, (to_offset_end - to_offset_st,))[:] = \
torch.tensor([], dtype=d_dtype, device=d_device).set_(tmp_tensor.storage(), offset_st, (offset_end - offset_st,))[:]
# self._storage_params[kw_name].storage()[to_offset_st: to_offset_end].copy_(tmp_tensor.storage()[offset_st: offset_end])
del tmp_tensor

def _named_members(self, get_members_fn, prefix='', recurse=True):
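The three hunks above apply the same fix: PyTorch 1.11 changed the behavior of storage.__getitem__, so the old storage()[a:b].copy_(...) pattern can no longer be used to copy between storage slices. Instead, a zero-element tensor's set_() is pointed at each storage with an explicit offset and size, and the copy goes through those views. Below is a minimal standalone sketch of the same trick; dst and src are illustrative tensors standing in for the flattened storage parameter and the contiguous parameter, and it assumes PyTorch 1.11 or newer.

import torch

# Illustrative stand-ins for self._storage_params[kw_name] and contiguous_param.
dst = torch.zeros(8)
src = torch.arange(8, dtype=torch.float32)

to_offset_st, to_offset_end = 2, 5   # destination window inside dst's storage
offset_st, offset_end = 4, 7         # source window inside src's storage

# torch.tensor([]).set_(storage, offset, size) builds a view over an existing
# storage without copying; assigning through [:] then copies element-wise,
# matching the old storage()[a:b].copy_(...) behavior.
dst_view = torch.tensor([], dtype=dst.dtype, device=dst.device).set_(
    dst.storage(), to_offset_st, (to_offset_end - to_offset_st,))
src_view = torch.tensor([], dtype=src.dtype, device=src.device).set_(
    src.storage(), offset_st, (offset_end - offset_st,))
dst_view[:] = src_view

print(dst)  # tensor([0., 0., 4., 5., 6., 0., 0., 0.])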
4 changes: 3 additions & 1 deletion bmtrain/param_init.py
@@ -22,7 +22,9 @@ def init_distributed_parameter(params : Iterable[torch.nn.Parameter]):

param._init_method(tmp_tensor)

param.storage().copy_(tmp_storage[partition_size * config['rank'] : partition_size * (config['rank'] + 1)])
# Pytorch 1.11 changed the API of storage.__getitem__
param[:] = torch.tensor([], dtype=param.dtype, device=param.device).set_(tmp_storage)[partition_size * config['rank'] : partition_size * (config['rank'] + 1)]
# param.storage().copy_(tmp_storage[partition_size * config['rank'] : partition_size * (config['rank'] + 1)])

def iterate_parameters(model : torch.nn.Module):
for kw, val in model._parameters.items():
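The change above follows the same idea for parameter initialization: every rank initializes the full-size tmp_tensor, then copies only its own partition_size-sized window of tmp_storage into its local shard through a tensor view instead of slicing the storage directly. A minimal sketch with made-up sizes and a hard-coded rank; world_size, rank, and the shapes are illustrative stand-ins for bmtrain's config values, not the repository's code.

import torch

world_size, rank = 4, 1
full_numel = 16
partition_size = full_numel // world_size

# Full-size tensor initialized identically on every rank; bmtrain would call
# param._init_method(tmp_tensor) here.
tmp_tensor = torch.arange(full_numel, dtype=torch.float32)
tmp_storage = tmp_tensor.storage()

# Local shard held by this rank.
param = torch.empty(partition_size)

# Wrap the whole storage in a tensor view (the PyTorch 1.11-safe replacement
# for storage slicing), then copy this rank's window into the local shard.
param[:] = torch.tensor([], dtype=param.dtype, device=param.device).set_(
    tmp_storage)[partition_size * rank : partition_size * (rank + 1)]

print(param)  # tensor([4., 5., 6., 7.]) on rank 1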
37 changes: 30 additions & 7 deletions setup.py
@@ -11,16 +11,39 @@ def get_avx_flags():
else:
return ["-march=native"]

def get_device_cc():
try:
CC_SET = set()
for i in range(torch.cuda.device_count()):
CC_SET.add(torch.cuda.get_device_capability(i))

if len(CC_SET) == 0:
return None

ret = ""
for it in CC_SET:
if len(ret) > 0:
ret = ret + " "
ret = ret + ("%d.%d" % it)
return ret
except RuntimeError:
return None

avx_flag = get_avx_flags()

if not torch.cuda.is_available():
os.environ["TORCH_CUDA_ARCH_LIST"] = os.environ.get("TORCH_CUDA_ARCH_LIST", "6.0 6.1 7.0 7.5 8.0+PTX")
else:
if torch.version.cuda.startswith("10"):
os.environ["TORCH_CUDA_ARCH_LIST"] = os.environ.get("TORCH_CUDA_ARCH_LIST", "6.0 6.1 7.0 7.5+PTX")
device_cc = get_device_cc()
if device_cc is None:
if not torch.cuda.is_available():
os.environ["TORCH_CUDA_ARCH_LIST"] = os.environ.get("TORCH_CUDA_ARCH_LIST", "6.0 6.1 7.0 7.5 8.0+PTX")
else:
os.environ["TORCH_CUDA_ARCH_LIST"] = os.environ.get("TORCH_CUDA_ARCH_LIST", "6.0 6.1 7.0 7.5 8.0 8.6+PTX")
if torch.version.cuda.startswith("10"):
os.environ["TORCH_CUDA_ARCH_LIST"] = os.environ.get("TORCH_CUDA_ARCH_LIST", "6.0 6.1 7.0 7.5+PTX")
else:
if not torch.version.cuda.startswith("11.0"):
os.environ["TORCH_CUDA_ARCH_LIST"] = os.environ.get("TORCH_CUDA_ARCH_LIST", "6.0 6.1 7.0 7.5 8.0 8.6+PTX")
else:
os.environ["TORCH_CUDA_ARCH_LIST"] = os.environ.get("TORCH_CUDA_ARCH_LIST", "6.0 6.1 7.0 7.5 8.0+PTX")
else:
os.environ["TORCH_CUDA_ARCH_LIST"] = os.environ.get("TORCH_CUDA_ARCH_LIST", device_cc)

setup(
name='bmtrain',
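With this change, setup.py first asks the local GPUs for their compute capabilities via get_device_cc(), which returns a space-separated string such as "7.0 8.6" (or None when no device can be queried), and only falls back to a hard-coded TORCH_CUDA_ARCH_LIST chosen by CUDA toolkit version: CUDA 10.x cannot compile for sm_80, and CUDA 11.0 cannot compile for sm_86. A condensed sketch of the same selection logic as a single function; the name pick_arch_list and the structure are illustrative, not the repository's exact code.

import os
import torch

def pick_arch_list() -> str:
    # Prefer the compute capabilities of the GPUs actually present,
    # e.g. "7.0 8.6" on a machine with a V100 and an A10.
    try:
        ccs = {torch.cuda.get_device_capability(i)
               for i in range(torch.cuda.device_count())}
    except RuntimeError:
        ccs = set()
    if ccs:
        return " ".join("%d.%d" % cc for cc in sorted(ccs))
    # No queryable device: fall back by toolkit version, since CUDA 10
    # cannot target sm_80 and CUDA 11.0 cannot target sm_86.
    if not torch.cuda.is_available():
        return "6.0 6.1 7.0 7.5 8.0+PTX"
    if torch.version.cuda.startswith("10"):
        return "6.0 6.1 7.0 7.5+PTX"
    if torch.version.cuda.startswith("11.0"):
        return "6.0 6.1 7.0 7.5 8.0+PTX"
    return "6.0 6.1 7.0 7.5 8.0 8.6+PTX"

# Respect a user-provided value, otherwise use the detected/derived list.
os.environ.setdefault("TORCH_CUDA_ARCH_LIST", pick_arch_list())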