Tensor Parallel support #2354

Open · wants to merge 10 commits into base: main
3 changes: 3 additions & 0 deletions docs/config.qmd
@@ -78,6 +78,9 @@ tf32: true # require >=ampere
bfloat16: true # require >=ampere
float16: true

# Use tensor parallelism
tensor_parallel: true # require multi-GPU

# Limit the memory for all available GPUs to this amount (if an integer, expressed in gigabytes); default: unset
gpu_memory_limit: 20GiB
# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge
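For context, the new option sits alongside the usual training keys in an axolotl YAML config. A minimal, illustrative example (every key other than tensor_parallel is a typical placeholder and not taken from this PR):

base_model: NousResearch/Llama-2-7b-hf   # placeholder model id
datasets:
  - path: tatsu-lab/alpaca               # placeholder dataset
    type: alpaca
output_dir: ./outputs/tp-run
micro_batch_size: 1
num_epochs: 1

# Shard the model across all visible GPUs via tensor parallelism
tensor_parallel: true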
3 changes: 3 additions & 0 deletions src/axolotl/core/trainer_builder.py
@@ -703,6 +703,9 @@ def build(self, total_num_steps):
                "accelerator_config"
            ] = self.cfg.accelerator_config

        if self.cfg.tensor_parallel:
            training_arguments_kwargs["tp_size"] = torch.cuda.device_count()

        if self.cfg.kd_ce_alpha is not None:
            training_arguments_kwargs["kd_ce_alpha"] = self.cfg.kd_ce_alpha
        if self.cfg.kd_alpha is not None:
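The builder change simply forwards a tensor-parallel degree (one rank per visible GPU) into the Trainer's arguments. A minimal sketch of the equivalent call, assuming a transformers release whose TrainingArguments accepts tp_size (which this change relies on); output_dir is a placeholder:

import torch
from transformers import TrainingArguments

# One tensor-parallel rank per visible GPU, mirroring the builder logic above.
args = TrainingArguments(
    output_dir="./outputs/tp-run",      # placeholder path
    tp_size=torch.cuda.device_count(),
)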
9 changes: 9 additions & 0 deletions src/axolotl/utils/config/models/input/v0_4_1/__init__.py
@@ -748,6 +748,8 @@ class AxolotlInputConfig(
    local_rank: Optional[int] = None
    ddp: Optional[bool] = None

    tensor_parallel: Optional[bool] = None

    seed: Optional[int] = None
    ddp_timeout: Optional[int] = None
    ddp_bucket_cap_mb: Optional[int] = None
@@ -1371,6 +1373,13 @@ def check_peft_layers_pattern(cls, data):
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_fsdp_tp(cls, data):
        if data.get("fsdp") and data.get("tensor_parallel"):
            raise ValueError("FSDP with tensor parallelism is not supported yet.")
        return data

    @model_validator(mode="after")
    def check_fft_possible_bad_config(self):
        if (
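The effect of the new validator is that a config enabling both fsdp and tensor_parallel is rejected before training starts. A standalone sketch of the same guard, using a simplified stand-in model rather than the real AxolotlInputConfig:

from typing import List, Optional
from pydantic import BaseModel, model_validator

class MiniConfig(BaseModel):  # simplified stand-in for AxolotlInputConfig
    fsdp: Optional[List[str]] = None
    tensor_parallel: Optional[bool] = None

    @model_validator(mode="before")
    @classmethod
    def check_fsdp_tp(cls, data):
        # Reject configs that enable both FSDP and tensor parallelism.
        if data.get("fsdp") and data.get("tensor_parallel"):
            raise ValueError("FSDP with tensor parallelism is not supported yet.")
        return data

MiniConfig(tensor_parallel=True)                        # OK
# MiniConfig(fsdp=["full_shard"], tensor_parallel=True) # fails validation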
3 changes: 3 additions & 0 deletions src/axolotl/utils/models.py
@@ -762,6 +762,9 @@ def _configure_zero3_memory_efficient_loading():
            return hf_ds_cfg

        skip_move_to_device = False
        if self.cfg.tensor_parallel:
            del self.model_kwargs["device_map"]

        if (  # pylint: disable=condition-evals-to-constant)
            (self.cfg.fsdp and self.cfg.fsdp_config.fsdp_cpu_ram_efficient_loading)
            and not qlora_fsdp
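Dropping device_map here presumably avoids a clash between accelerate-style device placement and the sharding that tensor-parallel loading performs itself. Purely for orientation (this is not the code path added by the PR), a hedged sketch of tensor-parallel loading in recent transformers releases, assuming the model supports tp_plan="auto" and the script is launched with torchrun across multiple GPUs; the model id is a placeholder:

from transformers import AutoModelForCausalLM

# Launched with e.g. `torchrun --nproc-per-node=<num_gpus> train.py`
model = AutoModelForCausalLM.from_pretrained(
    "NousResearch/Llama-2-7b-hf",  # placeholder model id
    tp_plan="auto",                # shard weights across the participating GPUs
    # note: no device_map here -- it would conflict with tensor-parallel sharding
)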
1 change: 1 addition & 0 deletions src/axolotl/utils/trainer.py
@@ -547,6 +547,7 @@ def prepare_optim_env(cfg):
    if not check_cuda_p2p_ib_support():
        if os.getenv("NCCL_P2P_DISABLE") is None:
            os.environ["NCCL_P2P_DISABLE"] = "1"

    if cfg.fsdp:
        setup_fsdp_envs(cfg)
    elif cfg.deepspeed: