merge main into multi-backend-refactor

Titus-von-Koeller committed Jul 26, 2024
2 parents 0859784 + 9b72679, commit 63f5872
Showing 22 changed files with 267 additions and 80 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/build_documentation.yml
@@ -13,7 +13,9 @@ jobs:
     with:
       commit_sha: ${{ github.sha }}
       package: bitsandbytes
-      repo_owner: TimDettmers
+      repo_owner: bitsandbytes-foundation
+      # avoid /src suffix leading to wrong links, like bitsandbytes/blob/main/src/bitsandbytes/nn/
+      version_tag_suffix: '' # defaults to '/src'
       custom_container: huggingface/transformers-doc-builder
     secrets:
       hf_token: ${{ secrets.HUGGINGFACE_PUSH }}
6 changes: 4 additions & 2 deletions .github/workflows/build_pr_documentation.yml
@@ -9,11 +9,13 @@ concurrency:

 jobs:
   build:
-    if: github.repository == 'TimDettmers/bitsandbytes'
+    if: github.repository == 'bitsandbytes-foundation/bitsandbytes'
     uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
     with:
       commit_sha: ${{ github.event.pull_request.head.sha }}
       pr_number: ${{ github.event.number }}
       package: bitsandbytes
-      repo_owner: TimDettmers
+      repo_owner: bitsandbytes-foundation
+      # avoid /src suffix leading to wrong links, like bitsandbytes/blob/main/src/bitsandbytes/nn/
+      version_tag_suffix: '' # defaults to '/src'
       custom_container: huggingface/transformers-doc-builder
6 changes: 2 additions & 4 deletions .github/workflows/python-package.yml
@@ -63,12 +63,10 @@ jobs:
         os: [ubuntu-latest, windows-latest]
         arch: [x86_64, aarch64]
         cuda_version:
-          ["11.7.1", "11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.0"]
+          ["11.7.1", "11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.1", "12.5.0"]
         exclude:
           - os: windows-latest # This probably requires arm64 Windows agents
             arch: aarch64
-          - os: windows-latest # The Jimver/cuda-toolkit action used for Windows builds is not updated for 12.4 yet.
-            cuda_version: "12.4.0"
           - os: ubuntu-latest # Temporary. Takes too long, not ready yet.
             arch: aarch64
     runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents
@@ -79,7 +77,7 @@ jobs:
         if: startsWith(matrix.os, 'ubuntu')
         uses: docker/setup-qemu-action@v2
       # Windows: We install Cuda on the agent (slow)
-      - uses: Jimver/cuda-toolkit@v0.2.14
+      - uses: Jimver/cuda-toolkit@v0.2.16
        if: startsWith(matrix.os, 'windows')
        id: cuda-toolkit
        with:
26 changes: 26 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,29 @@
+### 0.43.2
+
+This release is quite significant, as the QLoRA bug fix has big implications for higher `seqlen` and batch sizes.
+
+For each additional sequence (i.e., a batch size increase of one), we expect memory savings of:
+- 405B: 39GB for seqlen 1024, and 4888GB for 128k
+- 70B: 20.1GB for 1024 and 2516GB for 128k
+
+The savings arise because activations are unnecessary for frozen parameters, yet the now-fixed bug still erroneously allocated memory for them.
+
+#### Improvements:
+
+- docs: FSDP+QLoRA and CPU install guide (#1211 #1227, thanks @stevhliu)
+- Add CUDA 12.5 and update 12.4 builds (#1284)
+
+#### Bug Fixes
+
+- 4bit getstate and 8bit deepcopy (#1230 #1231, thanks @BenjaminBossan)
+- missing optimizers in `str2optimizer32bit` (#1222, thanks @EtienneDosSantos)
+- CUDA 12.5 build issue (#1273, thanks @HennerM)
+- fix for min_8bit_size functionality in Optimizer base classes (#1286, thanks @Edenzzzz)
+- QLoRA mem bug (#1270, thanks @Ther-nullptr)
+- tests for cpu only platforms (#1259, thanks @galqiwi)
+- restoration of quant_storage for CPU offloading (#1279)
+- optim update error with non-contiguous grads/params (deepspeed) (#1187)
+
 ### 0.43.1

 #### Improvements:
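A quick sanity check on those numbers (our own arithmetic, not part of the release notes): the quoted per-sequence savings scale essentially linearly with sequence length once "128k" is read as 128,000 tokens.

```python
# Back-of-the-envelope check (assumption: "128k" means 128,000 tokens).
# Scaling the seqlen-1024 savings linearly reproduces the quoted 128k
# numbers to within roughly 0.5%.
for model, gb_at_1024, gb_at_128k in [("405B", 39.0, 4888.0), ("70B", 20.1, 2516.0)]:
    linear_estimate = gb_at_1024 * (128_000 / 1024)
    print(f"{model}: linear estimate {linear_estimate:.1f} GB vs quoted {gb_at_128k} GB")
```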
8 changes: 7 additions & 1 deletion CMakeLists.txt
@@ -77,6 +77,13 @@ endif()


 if(BUILD_CUDA)
+    # NVCC normally will only work with MSVC up to 1939. VS2022 17.10+ starts using versions 1940+.
+    # Workaround: use --allow-unsupported-compiler
+    # This needs to be added *before* we try to enable the CUDA language so CMake's compiler check passes.
+    if(MSVC AND MSVC_VERSION VERSION_GREATER_EQUAL 1940)
+        string(APPEND CMAKE_CUDA_FLAGS " --allow-unsupported-compiler")
+    endif()
+
     enable_language(CUDA) # This will fail if CUDA is not found
     find_package(CUDAToolkit REQUIRED)

@@ -229,7 +236,6 @@ if(WIN32)
     set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
 endif()

-# Weird MSVC hacks
 if(MSVC)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2 /fp:fast")
 endif()
5 changes: 5 additions & 0 deletions _typos.toml
@@ -1,5 +1,10 @@
 [files]

+[default]
+extend-ignore-re = [
+  "@Ther-nul", # valid Github user
+]
+
 [default.extend-identifiers]

 [type.py.extend-words]
2 changes: 1 addition & 1 deletion bitsandbytes/__init__.py
@@ -74,4 +74,4 @@
     "optim.optimizer.MockArgs": False,
 }

-__version__ = "0.43.2.dev"
+__version__ = "0.43.3.dev"
4 changes: 2 additions & 2 deletions bitsandbytes/autograd/_functions.py
@@ -524,7 +524,7 @@ def forward(ctx, A, B, out=None, bias=None, quant_state: Optional[F.QuantState]
         ctx.dtype_A, ctx.dtype_B, ctx.dtype_bias = A.dtype, B.dtype, None if bias is None else bias.dtype

         if any(ctx.needs_input_grad[:2]):
-            ctx.tensors = (A, B)
+            ctx.tensors = (None, B)
         else:
             ctx.tensors = (None, None)

@@ -537,7 +537,7 @@ def backward(ctx, grad_output):
             return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, bias_grad, None

         req_gradA, _, _, req_gradBias, _ = ctx.needs_input_grad
-        A, B = ctx.tensors
+        _, B = ctx.tensors

         grad_A, grad_B, grad_bias = None, None, None
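This hunk is the QLoRA memory fix from the changelog: the input activation `A` was saved for backward even though the 4-bit weight `B` is frozen, so `grad_B` is never computed and `grad_A = grad_output @ B` needs only `B`. A minimal sketch of the idea (our simplification, not the bitsandbytes implementation):

```python
import torch

class FrozenWeightMatmul(torch.autograd.Function):
    """Sketch: matmul against a frozen (e.g. dequantized 4-bit) weight.
    Backward never produces a weight gradient, so only B is saved and the
    potentially huge activation A is dropped, which is the source of the
    memory savings."""

    @staticmethod
    def forward(ctx, A, B):
        ctx.save_for_backward(B)  # deliberately NOT saving A
        return A @ B.t()

    @staticmethod
    def backward(ctx, grad_output):
        (B,) = ctx.saved_tensors
        grad_A = grad_output @ B  # d(A @ B^T)/dA = grad_output @ B
        return grad_A, None       # frozen weight: no grad_B
```

Since the saved activations scale with `batch * seqlen * hidden`, dropping `A` is exactly what produces the per-sequence savings listed in the 0.43.2 release notes.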
29 changes: 29 additions & 0 deletions bitsandbytes/functional.py
@@ -27,6 +27,35 @@ def prod(iterable):

 if lib and lib.compiled_with_cuda:
     """C FUNCTIONS FOR OPTIMIZERS"""
+    str2optimizer32bit = {
+        "adam": (
+            lib.cadam32bit_grad_fp32,
+            lib.cadam32bit_grad_fp16,
+            lib.cadam32bit_grad_bf16,
+        ),
+        "momentum": (
+            lib.cmomentum32bit_grad_32,
+            lib.cmomentum32bit_grad_16,
+        ),
+        "rmsprop": (
+            lib.crmsprop32bit_grad_32,
+            lib.crmsprop32bit_grad_16,
+        ),
+        "lion": (
+            lib.clion32bit_grad_fp32,
+            lib.clion32bit_grad_fp16,
+            lib.clion32bit_grad_bf16,
+        ),
+        "adagrad": (
+            lib.cadagrad32bit_grad_32,
+            lib.cadagrad32bit_grad_16,
+        ),
+        "lamb": (
+            lib.cadam32bit_grad_fp32,
+            lib.cadam32bit_grad_fp16,
+        ),
+    }

     str2optimizer8bit = {
         "adam": (
             lib.cadam_static_8bit_grad_32,
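This restores the full 32-bit dispatch table whose missing entries were behind the `str2optimizer32bit` bug fix (#1222). Each tuple maps the gradient dtype to a compiled C kernel: fp32 first, fp16 second, and bf16 third where a bf16 kernel exists. A hypothetical lookup helper (ours, not bitsandbytes API) illustrating how such a table is typically consumed:

```python
import torch

def get_kernel(table, name, dtype):
    # table is a dict shaped like str2optimizer32bit:
    # optimizer name -> (fp32_kernel, fp16_kernel[, bf16_kernel])
    fns = table[name]  # KeyError here means no 32-bit kernel for this optimizer
    if dtype == torch.float32:
        return fns[0]
    if dtype == torch.float16:
        return fns[1]
    if dtype == torch.bfloat16 and len(fns) == 3:
        return fns[2]
    raise ValueError(f"no 32-bit {name!r} kernel for dtype {dtype}")
```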
27 changes: 21 additions & 6 deletions bitsandbytes/nn/modules.py
@@ -240,7 +240,7 @@ def __new__(
         return self

     def __getstate__(self):
-        state = self.__dict__
+        state = self.__dict__.copy()
         state["data"] = self.data
         state["requires_grad"] = self.requires_grad
         return state
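Why the `.copy()` matters: `self.__dict__` is returned by reference, so the two key insertions below it would otherwise be written straight into the live parameter's `__dict__`, mutating the object as a side effect of pickling. A standalone illustration (toy classes of ours, not bitsandbytes code):

```python
class Buggy:
    def __getstate__(self):
        state = self.__dict__          # alias of the live dict
        state["data"] = "snapshot"     # side effect: pollutes the object itself
        return state

class Fixed:
    def __getstate__(self):
        state = self.__dict__.copy()   # independent snapshot
        state["data"] = "snapshot"
        return state

b, f = Buggy(), Fixed()
b.__getstate__(); f.__getstate__()
assert "data" in b.__dict__      # bug: object mutated by serialization
assert "data" not in f.__dict__  # fix: object left untouched
```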
@@ -286,6 +286,9 @@ def from_prequantized(
         self.compress_statistics = self.quant_state.nested
         self.quant_type = self.quant_state.quant_type
         self.bnb_quantized = True
+
+        self.quant_storage = data.dtype
+
         return self

     def _quantize(self, device):
@@ -340,6 +343,7 @@ def to(self, *args, **kwargs):
                 blocksize=self.blocksize,
                 compress_statistics=self.compress_statistics,
                 quant_type=self.quant_type,
+                quant_storage=self.quant_storage,
             )

         return new_param
@@ -457,7 +461,7 @@ def forward(self, x: torch.Tensor):
             # since we registered the module, we can recover the state here
             assert self.weight.shape[1] == 1
             if not isinstance(self.weight, Params4bit):
-                self.weight = Params4bit(self.weight, quant_storage=self.quant_storage)
+                self.weight = Params4bit(self.weight, quant_storage=self.quant_storage, bnb_quantized=True)
             self.weight.quant_state = self.quant_state
             else:
                 print(
@@ -567,13 +571,12 @@ def __new__(
         CB=None,
         SCB=None,
     ):
-        cls.has_fp16_weights = has_fp16_weights
-        cls.CB = None
-        cls.SCB = None
         if data is None:
             data = torch.empty(0)
         obj = torch.Tensor._make_subclass(cls, data, requires_grad)
-        obj.CB, obj.SCB = cls.CB, cls.SCB
+        obj.CB = CB
+        obj.SCB = SCB
+        obj.has_fp16_weights = has_fp16_weights
         return obj

     def cuda(self, device):
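The rewritten `__new__` fixes a classic Python pitfall: assigning to `cls.CB`/`cls.SCB` created *class* attributes, so state was shared across all `Int8Params` instances and the `CB`/`SCB` constructor arguments were silently discarded, which is also what broke `deepcopy` (#1231). A minimal reproduction of the pitfall (toy classes of ours, not bitsandbytes code):

```python
class Shared:
    def __init__(self, cb=None):
        type(self).cb = None   # class attribute: shared, and the argument is lost

class PerInstance:
    def __init__(self, cb=None):
        self.cb = cb           # instance attribute: each object keeps its own

a, b = Shared(cb=1), Shared(cb=2)
assert a.cb is None and b.cb is None   # both constructor arguments dropped
x, y = PerInstance(cb=1), PerInstance(cb=2)
assert x.cb == 1 and y.cb == 2         # the fixed behavior
```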
@@ -592,6 +595,18 @@ def cuda(self, device):

         return self

+    def __deepcopy__(self, memo):
+        # adjust this if new arguments are added to the constructor
+        new_instance = type(self).__new__(
+            type(self),
+            data=copy.deepcopy(self.data, memo),
+            requires_grad=self.requires_grad,
+            has_fp16_weights=self.has_fp16_weights,
+            CB=copy.deepcopy(self.CB, memo),
+            SCB=copy.deepcopy(self.SCB, memo),
+        )
+        return new_instance
+
     def cpu(self):
         # we store the 8-bit rows-major weight
         B = self.data.contiguous().bfloat16().cpu()
12 changes: 10 additions & 2 deletions bitsandbytes/optim/optimizer.py
@@ -437,7 +437,7 @@ def init_state(self, group, p, gindex, pindex):
         state = self.state[p]
         state["step"] = 0

-        if dtype == torch.float32 or (dtype == torch.uint8 and p.numel() < 4096):
+        if dtype == torch.float32:
             state["state1"] = self.get_state_buffer(p, dtype=torch.float32)
             state["state2"] = self.get_state_buffer(p, dtype=torch.float32)
         elif dtype == torch.uint8:
@@ -474,6 +474,10 @@ def init_state(self, group, p, gindex, pindex):

     @torch.no_grad()
     def update_step(self, group, p, gindex, pindex):
+        # avoid update error from non-contiguous memory layout
+        p.data = p.data.contiguous()
+        p.grad = p.grad.contiguous()
+
         state = self.state[p]
         grad = p.grad

@@ -656,7 +660,7 @@ def init_state(self, group, p, gindex, pindex):
         state = self.state[p]
         state["step"] = 0

-        if dtype == torch.float32 or (dtype == torch.uint8 and p.numel() < 4096):
+        if dtype == torch.float32:
             state["state1"] = self.get_state_buffer(p, dtype=torch.float32)
         elif dtype == torch.uint8:
             if state["step"] == 0:
@@ -685,6 +689,10 @@ def init_state(self, group, p, gindex, pindex):

     @torch.no_grad()
     def update_step(self, group, p, gindex, pindex):
+        # avoid update error from non-contiguous memory layout
+        p.data = p.data.contiguous()
+        p.grad = p.grad.contiguous()
+
         state = self.state[p]
         grad = p.grad
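The `.contiguous()` calls guard against the DeepSpeed case (#1187) where a parameter or gradient arrives as a non-contiguous view: the C/CUDA optimizer kernels walk memory linearly from `data_ptr()`, which is only meaningful for contiguous storage. A small demonstration of the hazard (our example, not from the diff):

```python
import torch

p = torch.arange(16, dtype=torch.float32).reshape(4, 4).t()  # transposed view
assert not p.is_contiguous()
# A kernel reading p.numel() floats linearly from p.data_ptr() would see the
# parent tensor's row-major bytes, i.e. the untransposed values: silently wrong.
p = p.contiguous()  # same values, freshly laid-out storage; safe for C kernels
assert p.is_contiguous()
```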
2 changes: 0 additions & 2 deletions csrc/kernels.cu
@@ -12,8 +12,6 @@
 #include <cub/block/block_reduce.cuh>
 #include <cub/cub.cuh>
 #include <math_constants.h>
-#include <thrust/host_vector.h>
-#include <thrust/device_vector.h>
 #include <mma.h>
4 changes: 0 additions & 4 deletions csrc/ops.cuh
@@ -19,10 +19,6 @@
 #include <vector>
 #include <functional>

-#include <thrust/host_vector.h>
-#include <thrust/device_vector.h>
-
-
 #define CUDA_CHECK_RETURN(value) { \
     cudaError_t _m_cudaStat = value; \