
Commit 80f6b0e

feat: Support LoraConfig in TorchTune BuiltinTrainer (#102)
* feat: Add lora types.
* chore: propagate lora parameters in command.
* feat(lora): Add support for QLoRA.
* fix(lora): remove extra quote symbol in lora attn module.
* fix(lora): replace direct field override with field map.
* fix(lora): remove extra flags.
* fix(lora): fix wrong default list value in LoraConfig.
* fix(lora): remove outdated code.
* test(backend): Add test for lora.

Signed-off-by: Electronic-Waste <2690692950@qq.com>
1 parent 38390f7 commit 80f6b0e
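For context, a minimal usage sketch of what this commit enables: passing a `LoraConfig` as the new `peft_config` of `TorchTuneConfig` when submitting a TrainJob. It assumes the `TrainerClient().train()` entry point and that `BuiltinTrainer`, `TorchTuneConfig`, and `TrainerClient` are exported at the package top level, as `LoraConfig` now is; the runtime name is a placeholder, and the LoRA values mirror the new unit test rather than recommended settings.

```python
# Sketch only: fine-tune with LoRA through the TorchTune builtin trainer.
# The runtime name below is a placeholder; use whichever TorchTune runtime
# your cluster installs. get_runtime()/train() signatures are assumed from
# the SDK and may differ by version.
from kubeflow.trainer import (
    BuiltinTrainer,
    LoraConfig,
    TorchTuneConfig,
    TrainerClient,
)

client = TrainerClient()

job_name = client.train(
    runtime=client.get_runtime("torchtune-llama3.2-1b"),
    trainer=BuiltinTrainer(
        config=TorchTuneConfig(
            num_nodes=2,
            peft_config=LoraConfig(
                apply_lora_to_mlp=True,
                lora_rank=8,
                lora_alpha=16,
                lora_dropout=0.1,
            ),
        ),
    ),
)
```

The backend translates the `LoraConfig` fields into `model.*` overrides appended to the `tune run` command, as shown in the `utils.py` change below.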

4 files changed, +121 -5 lines

kubeflow/trainer/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -32,6 +32,7 @@
     HuggingFaceDatasetInitializer,
     HuggingFaceModelInitializer,
     Initializer,
+    LoraConfig,
     Loss,
     Runtime,
     RuntimeTrainer,
@@ -49,6 +50,7 @@
     "HuggingFaceDatasetInitializer",
     "HuggingFaceModelInitializer",
     "Initializer",
+    "LoraConfig",
     "Loss",
     "MODEL_PATH",
     "Runtime",

kubeflow/trainer/backends/kubernetes/backend_test.py

Lines changed: 38 additions & 3 deletions
@@ -238,12 +238,14 @@ def get_custom_trainer(
     )


-def get_builtin_trainer() -> models.TrainerV1alpha1Trainer:
+def get_builtin_trainer(
+    args: list[str],
+) -> models.TrainerV1alpha1Trainer:
     """
     Get the builtin trainer for the TrainJob.
     """
     return models.TrainerV1alpha1Trainer(
-        args=["batch_size=2", "epochs=2", "loss=Loss.CEWithChunkedOutputLoss"],
+        args=args,
         command=["tune", "run"],
         numNodes=2,
     )
@@ -707,7 +709,40 @@ def test_get_runtime_packages(kubernetes_backend, test_case):
             expected_output=get_train_job(
                 runtime_name=TORCH_TUNE_RUNTIME,
                 train_job_name=TRAIN_JOB_WITH_BUILT_IN_TRAINER,
-                train_job_trainer=get_builtin_trainer(),
+                train_job_trainer=get_builtin_trainer(
+                    args=["batch_size=2", "epochs=2", "loss=Loss.CEWithChunkedOutputLoss"],
+                ),
+            ),
+        ),
+        TestCase(
+            name="valid flow with built in trainer and lora config",
+            expected_status=SUCCESS,
+            config={
+                "trainer": types.BuiltinTrainer(
+                    config=types.TorchTuneConfig(
+                        num_nodes=2,
+                        peft_config=types.LoraConfig(
+                            apply_lora_to_mlp=True,
+                            lora_rank=8,
+                            lora_alpha=16,
+                            lora_dropout=0.1,
+                        ),
+                    ),
+                ),
+                "runtime": TORCH_TUNE_RUNTIME,
+            },
+            expected_output=get_train_job(
+                runtime_name=TORCH_TUNE_RUNTIME,
+                train_job_name=TRAIN_JOB_WITH_BUILT_IN_TRAINER,
+                train_job_trainer=get_builtin_trainer(
+                    args=[
+                        "model.apply_lora_to_mlp=True",
+                        "model.lora_rank=8",
+                        "model.lora_alpha=16",
+                        "model.lora_dropout=0.1",
+                        "model.lora_attn_modules=[q_proj,v_proj,output_proj]",
+                    ],
+                ),
             ),
         ),
         TestCase(

kubeflow/trainer/types/types.py

Lines changed: 42 additions & 0 deletions
@@ -110,6 +110,44 @@ class TorchTuneInstructDataset:
     column_map: Optional[dict[str, str]] = None


+@dataclass
+class LoraConfig:
+    """Configuration for the LoRA/QLoRA/DoRA.
+    REF: https://meta-pytorch.org/torchtune/main/tutorials/memory_optimizations.html
+
+    Args:
+        apply_lora_to_mlp (`Optional[bool]`):
+            Whether to apply LoRA to the MLP in each transformer layer.
+        apply_lora_to_output (`Optional[bool]`):
+            Whether to apply LoRA to the model's final output projection.
+        lora_attn_modules (`list[str]`):
+            A list of strings specifying which layers of the model to apply LoRA,
+            default is ["q_proj", "v_proj", "output_proj"]:
+            1. "q_proj" applies LoRA to the query projection layer.
+            2. "k_proj" applies LoRA to the key projection layer.
+            3. "v_proj" applies LoRA to the value projection layer.
+            4. "output_proj" applies LoRA to the attention output projection layer.
+        lora_rank (`Optional[int]`): The rank of the low rank decomposition.
+        lora_alpha (`Optional[int]`):
+            The scaling factor that adjusts the magnitude of the low-rank matrices' output.
+        lora_dropout (`Optional[float]`):
+            The probability of applying Dropout to the low rank updates.
+        quantize_base (`Optional[bool]`): Whether to enable model quantization.
+        use_dora (`Optional[bool]`): Whether to enable DoRA.
+    """
+
+    apply_lora_to_mlp: Optional[bool] = None
+    apply_lora_to_output: Optional[bool] = None
+    lora_attn_modules: list[str] = field(
+        default_factory=lambda: ["q_proj", "v_proj", "output_proj"]
+    )
+    lora_rank: Optional[int] = None
+    lora_alpha: Optional[int] = None
+    lora_dropout: Optional[float] = None
+    quantize_base: Optional[bool] = None
+    use_dora: Optional[bool] = None
+
+
 # Configuration for the TorchTune LLM Trainer.
 @dataclass
 class TorchTuneConfig:
@@ -127,6 +165,9 @@ class TorchTuneConfig:
         loss (`Optional[Loss]`): The loss algorithm we use to fine-tune the LLM,
             e.g. `torchtune.modules.loss.CEWithChunkedOutputLoss`.
         num_nodes (`Optional[int]`): The number of nodes to use for training.
+        peft_config (`Optional[LoraConfig]`):
+            Configuration for the PEFT(Parameter-Efficient Fine-Tuning),
+            including LoRA/QLoRA/DoRA, etc.
         dataset_preprocess_config (`Optional[TorchTuneInstructDataset]`):
             Configuration for the dataset preprocessing.
         resources_per_node (`Optional[Dict]`): The computing resources to allocate per node.
@@ -137,6 +178,7 @@ class TorchTuneConfig:
     epochs: Optional[int] = None
     loss: Optional[Loss] = None
     num_nodes: Optional[int] = None
+    peft_config: Optional[LoraConfig] = None
     dataset_preprocess_config: Optional[TorchTuneInstructDataset] = None
     resources_per_node: Optional[dict] = None

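Because the same dataclass carries the QLoRA and DoRA switches (`quantize_base`, `use_dora`), a QLoRA-style setup is just a `LoraConfig` with quantization of the base weights enabled. A small illustrative sketch, with arbitrary values rather than recommended defaults:

```python
from kubeflow.trainer import LoraConfig

# QLoRA-style config: LoRA adapters on top of a quantized base model.
# Values are illustrative, not recommended defaults.
qlora = LoraConfig(
    apply_lora_to_mlp=True,
    lora_rank=16,
    lora_alpha=32,
    lora_dropout=0.05,
    quantize_base=True,  # quantize the frozen base weights (QLoRA)
    # use_dora=True,     # or enable DoRA instead
)

# lora_attn_modules keeps its default: ["q_proj", "v_proj", "output_proj"]
print(qlora.lora_attn_modules)
```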
kubeflow/trainer/utils/utils.py

Lines changed: 39 additions & 2 deletions
@@ -477,13 +477,50 @@ def get_args_using_torchtune_config(
     else:
         args.append(f"dataset.data_dir={os.path.join(constants.DATASET_PATH, relative_path)}")

+    if fine_tuning_config.peft_config:
+        args += get_args_from_peft_config(fine_tuning_config.peft_config)
+
     if fine_tuning_config.dataset_preprocess_config:
-        args += get_args_in_dataset_preprocess_config(fine_tuning_config.dataset_preprocess_config)
+        args += get_args_from_dataset_preprocess_config(
+            fine_tuning_config.dataset_preprocess_config
+        )
+
+    return args
+
+
+def get_args_from_peft_config(peft_config: types.LoraConfig) -> list[str]:
+    """
+    Get the args from the given PEFT config.
+    """
+    args = []
+
+    if not isinstance(peft_config, types.LoraConfig):
+        raise ValueError(f"Invalid PEFT config type: {type(peft_config)}.")
+
+    field_map = {
+        "apply_lora_to_mlp": "model.apply_lora_to_mlp",
+        "apply_lora_to_output": "model.apply_lora_to_output",
+        "lora_rank": "model.lora_rank",
+        "lora_alpha": "model.lora_alpha",
+        "lora_dropout": "model.lora_dropout",
+        "quantize_base": "model.quantize_base",
+        "use_dora": "model.use_dora",
+    }
+
+    # Override the PEFT fields if they are provided.
+    for field, arg_name in field_map.items():
+        value = getattr(peft_config, field, None)
+        if value:
+            args.append(f"{arg_name}={value}")
+
+    # Override the LoRA attention modules if they are provided.
+    if peft_config.lora_attn_modules:
+        args.append(f"model.lora_attn_modules=[{','.join(peft_config.lora_attn_modules)}]")

     return args


-def get_args_in_dataset_preprocess_config(
+def get_args_from_dataset_preprocess_config(
     dataset_preprocess_config: types.TorchTuneInstructDataset,
 ) -> list[str]:
     """

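To make the flag translation concrete, here is roughly what the new helper produces for the `LoraConfig` used in the test above. Calling `get_args_from_peft_config` directly is an internal detail rather than public API; the import paths follow the file locations in this commit.

```python
from kubeflow.trainer.types import types
from kubeflow.trainer.utils import utils

# Same LoraConfig as in the new backend_test.py test case.
args = utils.get_args_from_peft_config(
    types.LoraConfig(
        apply_lora_to_mlp=True,
        lora_rank=8,
        lora_alpha=16,
        lora_dropout=0.1,
    )
)
print(args)
# Expected, per the test above:
# ['model.apply_lora_to_mlp=True',
#  'model.lora_rank=8',
#  'model.lora_alpha=16',
#  'model.lora_dropout=0.1',
#  'model.lora_attn_modules=[q_proj,v_proj,output_proj]']
```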