diff --git a/deepspeed/runtime/zero/stage2.py b/deepspeed/runtime/zero/stage2.py
index a9216160f2c5..ad529aa96391 100755
--- a/deepspeed/runtime/zero/stage2.py
+++ b/deepspeed/runtime/zero/stage2.py
@@ -2000,3 +2000,106 @@ def _handle_overflow(cpu_sum, x, i):
         logger.info(
             f"rank {rank} detected overflow {cpu_sum} in tensor {i}:{t_i} shape {x.shape}"
         )
+
+
+def estimate_zero2_model_states_mem_needs(total_params,
+                                          num_gpus_per_node=1,
+                                          num_nodes=1,
+                                          cpu_offload=True,
+                                          additional_buffer_factor=1.5):
+
+    total_gpus = num_nodes * num_gpus_per_node
+
+    if cpu_offload:
+        gpu_mem = 2 * total_params
+        cpu_mem = total_params * max(4 * total_gpus, 16) * additional_buffer_factor
+    else:
+        gpu_mem = 4 * total_params + int(16 * total_params / total_gpus)
+        cpu_mem = total_params * 4 * num_gpus_per_node * additional_buffer_factor
+
+    return int(cpu_mem), int(gpu_mem)
+
+
+def model_to_params(model):
+    # shared params calculated only once
+    total_params = sum(
+        dict((p.data_ptr(),
+              p.numel()) for p in model.parameters()).values())
+    return total_params
+
+
+def estimate_zero2_model_states_mem_needs_all_live(model,
+                                                   num_gpus_per_node=1,
+                                                   num_nodes=1,
+                                                   additional_buffer_factor=1.5):
+    """
+    Print out estimates on memory usage requirements for ZeRO 2 params, optim states and gradients
+    for a given ``model`` and hardware setup.
+
+    If you have an actual model object, use this function and everything will be derived
+    automatically.
+
+    If it's a hypothetical model, use ``estimate_zero2_model_states_mem_needs_all_cold`` where you have to pass
+    the ``total_params`` explicitly.
+
+    Args:
+        - ``model``: ``nn.Module`` object
+        - ``num_gpus_per_node``: how many gpus per node (defaults to 1)
+        - ``num_nodes``: how many nodes (defaults to 1),
+        - ``additional_buffer_factor``: estimation factor (defaults to 1.5):
+
+    """
+
+    total_params = model_to_params(model)
+
+    estimate_zero2_model_states_mem_needs_all_cold(
+        total_params=total_params,
+        num_gpus_per_node=num_gpus_per_node,
+        num_nodes=num_nodes,
+        additional_buffer_factor=additional_buffer_factor)
+
+
+def estimate_zero2_model_states_mem_needs_all_cold(total_params,
+                                                   num_gpus_per_node=1,
+                                                   num_nodes=1,
+                                                   additional_buffer_factor=1.5):
+    """
+    Print out estimates on memory usage requirements for ZeRO 2 params, optim states and gradients
+    for a given ``model`` and hardware setup.
+
+    If it's a hypothetical model, use this function where you have to pass
+    the ``total_params`` explicitly.
+
+    If you have an actual model object, use ``estimate_zero2_model_states_mem_needs_all_live`` and everything
+    will be derived automatically.
+ + Args: + - ``total_params``: total model params + - ``num_gpus_per_node``: how many gpus per node (defaults to 1) + - ``num_nodes``: how many nodes (defaults to 1), + - ``additional_buffer_factor``: estimation factor (defaults to 1.5): + + """ + def format_options(cpu_offload): + enabled = [] + enabled.append(f"cpu_offload={1 if cpu_offload else 0}") + return ", ".join(enabled) + + nodes_str = "nodes" if num_nodes > 1 else "node" + gpus_str = "GPUs" if num_gpus_per_node > 1 else "GPU" + print( + "Estimated memory needed for params, optim states and gradients for a:\n" + f"HW: Setup with {num_nodes} {nodes_str}, {num_gpus_per_node} {gpus_str} per node.\n" + f"SW: Model with {int(total_params/1e6)}M total params.") + print(" per CPU | per GPU | Options") + for cpu_offload in [True, False]: + cpu_mem, gpu_mem = estimate_zero2_model_states_mem_needs( + total_params=total_params, + num_gpus_per_node=num_gpus_per_node, + num_nodes=num_nodes, + cpu_offload=cpu_offload, + additional_buffer_factor=additional_buffer_factor + ) + + options_str = format_options(cpu_offload=cpu_offload) + print(f" {cpu_mem/2**30:7.2f}GB | {gpu_mem/2**30:6.2f}GB | {options_str}") diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index 548c38a072c3..d492df42067d 100755 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -3258,3 +3258,155 @@ def _handle_overflow(cpu_sum, x, i): logger.info( f"rank {rank} detected overflow {cpu_sum} in tensor {i}:{t_i} shape {x.shape}" ) + + +def estimate_zero3_model_states_mem_needs(total_params, + largest_layer_params, + num_gpus_per_node=1, + num_nodes=1, + cpu_offload=True, + cpu_offload_params=True, + zero_init=True, + additional_buffer_factor=1.5): + + total_gpus = num_nodes * num_gpus_per_node + gpus_factor = 1 / num_nodes + largest_layer_memory = (4 * largest_layer_params) + + if cpu_offload: + if cpu_offload_params: + gpu_mem = largest_layer_memory + + if zero_init: + cpu_mem = total_params * 18 * gpus_factor * additional_buffer_factor + else: + cpu_mem = total_params * max(4 * num_gpus_per_node, + 18 * gpus_factor) * additional_buffer_factor + + else: + gpu_mem = largest_layer_memory + int(2 * total_params / total_gpus) + + if zero_init: + cpu_mem = total_params * 16 * gpus_factor * additional_buffer_factor + else: + cpu_mem = total_params * max(4 * num_gpus_per_node, + 16 * gpus_factor) * additional_buffer_factor + else: + gpu_mem = largest_layer_memory + int(18 * total_params / total_gpus) + if zero_init: + cpu_mem = largest_layer_params * 4 * num_gpus_per_node * additional_buffer_factor + else: + cpu_mem = total_params * 4 * num_gpus_per_node * additional_buffer_factor + + return int(cpu_mem), int(gpu_mem), largest_layer_memory + + +def model_to_params(model): + # shared params calculated only once + total_params = sum( + dict((p.data_ptr(), + p.numel()) for p in model.parameters()).values()) + + largest_layer_params = 0 + for m in model.modules(): + # assuming no shared params within a single layer + layer_params = sum(p.numel() for p in m.parameters(recurse=False)) + largest_layer_params = max(largest_layer_params, layer_params) + + return total_params, largest_layer_params + + +import math + + +def estimate_zero3_model_states_mem_needs_all_live(model, + num_gpus_per_node=1, + num_nodes=1, + additional_buffer_factor=1.5): + """ + Print out estimates on memory usage requirements for ZeRO 3 params, optim states and gradients + for a given ``model`` and hardware setup. 
+ + If you have an actual model object, use this function and everything will be derived + automatically. + + If it's a hypothetical model, use ``estimate_zero3_model_states_mem_needs_all_cold`` where you have to pass + the ``total_params`` and ``largest_layer_params`` explicitly. + + Args: + - ``model``: ``nn.Module`` object + - ``num_gpus_per_node``: how many gpus per node (defaults to 1) + - ``num_nodes``: how many nodes (defaults to 1), + - ``additional_buffer_factor``: estimation factor (defaults to 1.5): + + """ + + total_params, largest_layer_params = model_to_params(model) + + estimate_zero3_model_states_mem_needs_all_cold( + total_params=total_params, + largest_layer_params=largest_layer_params, + num_gpus_per_node=num_gpus_per_node, + num_nodes=num_nodes, + additional_buffer_factor=additional_buffer_factor) + + +def estimate_zero3_model_states_mem_needs_all_cold(total_params, + largest_layer_params, + num_gpus_per_node=1, + num_nodes=1, + additional_buffer_factor=1.5): + """ + Print out estimates on memory usage requirements for ZeRO 3 params, optim states and gradients + for a given ``model`` and hardware setup. + + If it's a hypothetical model, use this function where you have to pass + the ``total_params`` and ``largest_layer_params`` explicitly. + + If you have an actual model object, use ``estimate_zero3_model_states_mem_needs_all_live`` and everything + will be derived automatically. + + Args: + - ``total_params``: total model params + - ``largest_layer_params``: largest layer's params + - ``num_gpus_per_node``: how many gpus per node (defaults to 1) + - ``num_nodes``: how many nodes (defaults to 1), + - ``additional_buffer_factor``: estimation factor (defaults to 1.5): + + """ + def format_options(cpu_offload, cpu_offload_params, zero_init): + enabled = [] + enabled.append(f"cpu_offload={1 if cpu_offload else 0}") + enabled.append(f"cpu_offload_params={1 if cpu_offload_params else 0}") + enabled.append(f"zero_init={1 if zero_init else 0}") + return ", ".join(enabled) + + nodes_str = "nodes" if num_nodes > 1 else "node" + gpus_str = "GPUs" if num_gpus_per_node > 1 else "GPU" + print( + "Estimated memory needed for params, optim states and gradients for a:\n" + f"HW: Setup with {num_nodes} {nodes_str}, {num_gpus_per_node} {gpus_str} per node.\n" + f"SW: Model with {int(total_params/1e6)}M total params, {int(largest_layer_params/1e6)}M largest layer params." + ) + print(" per CPU | per GPU | Options") + for cpu_offload in [True, False]: + for cpu_offload_params in [True, False]: + if not cpu_offload and cpu_offload_params: + continue + for zero_init in [True, False]: + cpu_mem, gpu_mem, largest_layer_memory = estimate_zero3_model_states_mem_needs( + total_params=total_params, + largest_layer_params=largest_layer_params, + num_gpus_per_node=num_gpus_per_node, + num_nodes=num_nodes, + cpu_offload=cpu_offload, + cpu_offload_params=cpu_offload_params, + zero_init=zero_init, + additional_buffer_factor=additional_buffer_factor + ) + + options_str = format_options(cpu_offload=cpu_offload, + cpu_offload_params=cpu_offload_params, + zero_init=zero_init) + print( + f" {cpu_mem/2**30:7.2f}GB | {gpu_mem/2**30:6.2f}GB | {options_str}") diff --git a/docs/code-docs/source/index.rst b/docs/code-docs/source/index.rst index 2f39da002d5e..5175209cc1c4 100644 --- a/docs/code-docs/source/index.rst +++ b/docs/code-docs/source/index.rst @@ -79,6 +79,14 @@ Flops Profiler flops-profiler + +Memory Usage +------------------ +.. 
toctree::
+   :maxdepth: 2
+
+   memory
+
 Indices and tables
 ------------------
diff --git a/docs/code-docs/source/memory.rst b/docs/code-docs/source/memory.rst
new file mode 100644
index 000000000000..7f810309a80b
--- /dev/null
+++ b/docs/code-docs/source/memory.rst
@@ -0,0 +1,288 @@
+Memory Requirements
+-----------------------
+
+
+API To Estimate Memory Usage
+============================
+
+ZeRO2:
+
+.. autofunction:: deepspeed.runtime.zero.stage2.estimate_zero2_model_states_mem_needs_all_live
+
+.. autofunction:: deepspeed.runtime.zero.stage2.estimate_zero2_model_states_mem_needs_all_cold
+
+Examples:
+
+Let's try a 3B model with just one node of 8 GPUs, using a live model:
+
+.. code-block:: bash
+
+    python -c 'from transformers import AutoModel; \
+    from deepspeed.runtime.zero.stage2 import estimate_zero2_model_states_mem_needs_all_live; \
+    model = AutoModel.from_pretrained("t5-3b"); \
+    estimate_zero2_model_states_mem_needs_all_live(model, num_gpus_per_node=8, num_nodes=1)'
+    Estimated memory needed for params, optim states and gradients for a:
+    HW: Setup with 1 node, 8 GPUs per node.
+    SW: Model with 2851M total params.
+      per CPU  |  per GPU |   Options
+      127.48GB |   5.31GB | cpu_offload=1
+      127.48GB |  15.93GB | cpu_offload=0
+
+Now the same estimate without the actual model, which requires us to know just ``total_params``;
+we got that from the run above, so future estimates are much faster since we don't need to load
+the model.
+
+.. code-block:: bash
+
+    python -c 'from deepspeed.runtime.zero.stage2 import estimate_zero2_model_states_mem_needs_all_cold; \
+    estimate_zero2_model_states_mem_needs_all_cold(total_params=2851e6, num_gpus_per_node=8, num_nodes=1)'
+    Estimated memory needed for params, optim states and gradients for a:
+    HW: Setup with 1 node, 8 GPUs per node.
+    SW: Model with 2851M total params.
+      per CPU  |  per GPU |   Options
+      127.45GB |   5.31GB | cpu_offload=1
+      127.45GB |  15.93GB | cpu_offload=0
+
+There is a slight difference due to rounding: the actual live model has a few more params.
+
+
+ZeRO3:
+
+.. autofunction:: deepspeed.runtime.zero.stage3.estimate_zero3_model_states_mem_needs_all_live
+
+.. autofunction:: deepspeed.runtime.zero.stage3.estimate_zero3_model_states_mem_needs_all_cold
+
+Examples:
+
+Let's try a 3B model with just one node of 8 GPUs, using a live model:
+
+.. code-block:: bash
+
+    python -c 'from transformers import AutoModel; \
+    from deepspeed.runtime.zero.stage3 import estimate_zero3_model_states_mem_needs_all_live; \
+    model = AutoModel.from_pretrained("t5-3b"); \
+    estimate_zero3_model_states_mem_needs_all_live(model, num_gpus_per_node=8, num_nodes=1)'
+
+    Estimated memory needed for params, optim states and gradients for a:
+    HW: Setup with 1 node, 8 GPUs per node.
+    SW: Model with 2851M total params, 32M largest layer params.
+      per CPU  |  per GPU |   Options
+       71.71GB |   0.12GB | cpu_offload=1, cpu_offload_params=1, zero_init=1
+      127.48GB |   0.12GB | cpu_offload=1, cpu_offload_params=1, zero_init=0
+       63.74GB |   0.79GB | cpu_offload=1, cpu_offload_params=0, zero_init=1
+      127.48GB |   0.79GB | cpu_offload=1, cpu_offload_params=0, zero_init=0
+        1.47GB |   6.10GB | cpu_offload=0, cpu_offload_params=0, zero_init=1
+      127.48GB |   6.10GB | cpu_offload=0, cpu_offload_params=0, zero_init=0
+
+Now the same estimate without the actual model, which requires us to know ``total_params`` and
+``largest_layer_params``; we got those from the run above, so future estimates are much faster
+since we don't need to load the model.
+
+.. code-block:: bash
+
+    python -c 'from deepspeed.runtime.zero.stage3 import estimate_zero3_model_states_mem_needs_all_cold; \
+    estimate_zero3_model_states_mem_needs_all_cold(total_params=2851e6, largest_layer_params=32e6, num_gpus_per_node=8, num_nodes=1)'
+
+    Estimated memory needed for params, optim states and gradients for a:
+    HW: Setup with 1 node, 8 GPUs per node.
+    SW: Model with 2851M total params, 32M largest layer params.
+      per CPU  |  per GPU |   Options
+       71.69GB |   0.12GB | cpu_offload=1, cpu_offload_params=1, zero_init=1
+      127.45GB |   0.12GB | cpu_offload=1, cpu_offload_params=1, zero_init=0
+       63.72GB |   0.78GB | cpu_offload=1, cpu_offload_params=0, zero_init=1
+      127.45GB |   0.78GB | cpu_offload=1, cpu_offload_params=0, zero_init=0
+        1.43GB |   6.09GB | cpu_offload=0, cpu_offload_params=0, zero_init=1
+      127.45GB |   6.09GB | cpu_offload=0, cpu_offload_params=0, zero_init=0
+
+There is a slight difference due to rounding: the actual live model has a few more params.
+
+
+
+Discussion
+==========
+
+Let's look in detail at how the memory estimator API calculates these numbers, and also discuss
+some additional numbers that aren't covered by the API.
+
+In the following discussion:
+
+- ``params`` - total number of model params, which can be calculated as:
+
+.. code-block:: python
+
+    print(sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()))
+
+Some models already include the number of params in the model name, e.g. t5-11b (11B params), gpt-neo-1.3B (1.3B params), etc.
+
+Also, if the model weights are stored in ``fp32``, another quick way to calculate the size of the model is to simply divide the size of the ``state_dict`` file by 4 (fp32 == 4 bytes). For example, you can see that `t5-11b's pytorch_model.bin `__ is 42.1GB in size, so if we divide it by 4, we can immediately tell it's an 11B model.
+
+The following calculations show how much memory is required by model params, gradients and optimizer states. In addition to those, you will need enough memory to fit activation calculations and any temporary memory for intermediate calculations, which for long sequences could be very significant (e.g. could take the same amount of memory as params+grads+optim_states combined).
+
+The optimizer states assume that ``Adam`` is used, where 4 bytes per parameter are used by momentum and another 4 by variance (8 in total).
+
+Gradients at ``fp32`` take 4 bytes, and parameters take 2 bytes at ``fp16`` and 4 bytes at ``fp32``.
+
+**GPU RAM**
+
+The big question is how big a model you can fit on the hardware you have. Or rather, how much GPU RAM do you need to fit the desired model?
+
+
+* ZeRO-2:
+
+  - ``"cpu_offload": true``: 2 * params
+
+    Example: a 40GB GPU can fit an ~11B-param model (regardless of how many GPUs are used). Here the model is loaded in ``fp16``, so the model weights alone take about 22GB and the remaining 18GB are used by other components. You can barely fit a very small batch size in this scenario.
+
+  - ``"cpu_offload": false``: 4 * params + 16 * params / (total number of GPUs)
+
+* ZeRO-3:
+
+largest_layer_memory = 4 * largest_layer_params - the GPU memory needed to gather the largest layer on a single GPU: 2 bytes of fp16 params are gathered and 2 bytes of fp16 grads are computed (4 bytes per param in total). The optimizer states and fp32 parameters are updated in partitioned form and copied to the fp16 params in partitioned form. This happens during the optimizer step; after that the fp16 params are sufficient.
+
+  - case 1: ``"cpu_offload": false, "cpu_offload_params": false`` - largest_layer_memory + 18 * params / total number of gpus across all nodes
+  - case 2: ``"cpu_offload": true, "cpu_offload_params": true`` - largest_layer_memory. The main limit here is general RAM.
+  - case 3: ``"cpu_offload": true, "cpu_offload_params": false`` - largest_layer_memory + 2 * params / total number of gpus across all nodes
+
+  Example:
+
+.. code-block:: python
+
+    from transformers import AutoModel
+    model = AutoModel.from_pretrained("t5-large")
+
+    # shared params calculated only once
+    total_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values())
+
+    largest_layer_params = 0
+    for m in model.modules():
+        # assuming no shared params within a single layer
+        layer_params = sum(p.numel() for p in m.parameters(recurse=False))
+        largest_layer_params = max(largest_layer_params, layer_params)
+
+    largest_layer_memory = (4*largest_layer_params)
+
+    total_gpus = 4
+
+    case1 = largest_layer_memory + int(18*total_params/total_gpus)
+    case2 = largest_layer_memory
+    case3 = largest_layer_memory + int(2*total_params/total_gpus)
+
+    print(f"total params: {total_params/1e6:6.2f}M")
+    print(f"largest layer params: {largest_layer_params/1e6:6.2f}M")
+    print(f"largest layer memory: {largest_layer_memory>>20:6}MB")
+    print(f"case1 gpu memory: {(case1)>>20:6}MB")
+    print(f"case2 gpu memory: {(case2)>>20:6}MB")
+    print(f"case3 gpu memory: {(case3)>>20:6}MB")
+
+    total params: 737.67M
+    largest layer params:  32.90M
+    largest layer memory:    125MB
+    case1 gpu memory:   3291MB
+    case2 gpu memory:    125MB
+    case3 gpu memory:    477MB
+
+
+**General RAM**:
+
+One of the key features of ZeRO is its CPU offload, which can dramatically extend the total memory pool accessible to the project by using general RAM. One can easily expand their general RAM by 10x, at a significantly lower cost than it'd take to get the same amount of GPU RAM. And often it's not even possible to buy GPUs with that much RAM (a 112GB GPU, anybody?) since they simply don't exist yet.
+
+In the following calculations we will use:
+
+- ``additional_buffer_factor=1.5`` as an additional buffer factor to be conservative
+- ``n_gpus`` - the number of GPUs on a single node (machine)
+- ``total_gpus`` - the total number of GPUs across all nodes
+- ``params`` - total number of model params (see above for how to get this number)
+
+* ZeRO-2:
+
+  - ``"cpu_offload": false``:
+
+    params * 4 * n_gpus * additional_buffer_factor - this is the memory needed only at the beginning to initialize the model in CPU memory
+
+  - ``"cpu_offload": true``:
+
+    params * max(4 * n_gpus, 16) * additional_buffer_factor
+
+  Example: xxx
+
+* ZeRO-3:
+
+  gpus_factor = n_gpus / total_gpus
+
+  - case 1: ``"cpu_offload": false``:
+
+    Without ``zero.Init``:
+
+       params * 4 * n_gpus * additional_buffer_factor
+
+       this is the memory needed only at the beginning to initialize the model in CPU memory. Once the model is transferred to the GPUs this memory is freed.
+
+    With ``zero.Init``:
+
+       largest_layer_params * 4 * n_gpus * additional_buffer_factor
+
+       assuming PyTorch deallocates the memory once the tensors are moved to the GPU by ``zero.Init``
+
+  - case 2: ``"cpu_offload": true, "cpu_offload_params": true``:
+
+    Without ``zero.Init``:
+
+       params * max(4 * n_gpus, 18 * gpus_factor) * additional_buffer_factor
+
+    With ``zero.Init``:
+
+       params * 18 * gpus_factor * additional_buffer_factor
+
+  - case 3: ``"cpu_offload": true, "cpu_offload_params": false``:
+
+    Without ``zero.Init``:
+
+       params * max(4 * n_gpus, 16 * gpus_factor) * additional_buffer_factor
+
+    With ``zero.Init``:
+
+       params * 16 * gpus_factor * additional_buffer_factor
+
+
+Here is a breakdown of the 4, 16 and 18 multipliers (b = bytes):
+
+4 (in ``4 * n_gpus``):
+
+- when PyTorch creates a model, it creates it in fp32 by default (4 bytes per parameter)
+
+16:
+
+- 16b for fp32: 4b params, 4b grads, 4b momentum and 4b variance per parameter
+
+18:
+
+- 16b for fp32: 4b params, 4b grads, 4b momentum and 4b variance per parameter
+- +2b for fp16 params
+
+
+**Pinned Memory**
+
+Pinned general RAM is included in the normal general RAM allocations (i.e. these are not extra memory allocations; the numbers simply show how much of the general RAM is pinned).
+
+* ZeRO-2: pinning can't be controlled
+
+* ZeRO-3:
+
+To enable, add: ``"cpu_offload_use_pin_memory": true``
+
+Now there are 2 sub-cases:
+
+1. ``"cpu_offload_params": true``:
+
+   - 6 * params (2b for fp16 params + 4b for fp32 gradients)
+   - if ``gradient_accumulation_steps > 1`` an additional 2b for fp16 gradients are pinned
+
+2. ``"cpu_offload_params": false``:
+
+   - 4b for fp32 gradients
+
+
+**Activation Memory**
+
+XXX: For Transformers it is probably around (2 * seq * attn_heads + 16 * hidden_size) * seq * batch/gpu
+
+This needs to be completed.
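+
+Finally, to make the ZeRO-3 General RAM formulas above concrete, here is a minimal sketch that
+plugs in the t5-large numbers from the GPU RAM example (roughly 737.67M total params and 32.90M
+largest layer params) and assumes a single node with 4 GPUs, so ``gpus_factor == 1``. The numbers
+and the snippet are illustrative only; for real estimates use the estimator API documented at the
+top of this page.
+
+.. code-block:: python
+
+    # Hypothetical sketch of the ZeRO-3 general RAM formulas above.
+    # Assumed inputs: a t5-large-sized model on a single node with 4 GPUs.
+    total_params = 737_670_000
+    largest_layer_params = 32_900_000
+    n_gpus = 4
+    total_gpus = 4
+    gpus_factor = n_gpus / total_gpus
+    additional_buffer_factor = 1.5
+
+    GB = 2**30
+
+    cases = {
+        # case 1: "cpu_offload": false
+        "case1 w/o zero.Init": total_params * 4 * n_gpus * additional_buffer_factor,
+        "case1 w/  zero.Init": largest_layer_params * 4 * n_gpus * additional_buffer_factor,
+        # case 2: "cpu_offload": true, "cpu_offload_params": true
+        "case2 w/o zero.Init": total_params * max(4 * n_gpus, 18 * gpus_factor) * additional_buffer_factor,
+        "case2 w/  zero.Init": total_params * 18 * gpus_factor * additional_buffer_factor,
+        # case 3: "cpu_offload": true, "cpu_offload_params": false
+        "case3 w/o zero.Init": total_params * max(4 * n_gpus, 16 * gpus_factor) * additional_buffer_factor,
+        "case3 w/  zero.Init": total_params * 16 * gpus_factor * additional_buffer_factor,
+    }
+
+    for name, cpu_mem in cases.items():
+        print(f"{name}: {cpu_mem/GB:7.2f}GB general RAM")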