Skip to content

Commit b890a2a

Browse files
KEP-2401: Support mutating dataset preprocessing config in SDK (kubeflow/trainer#2638)
* feat(sdk): Add InstructDataset and dataset_preprocess_config. Signed-off-by: Electronic-Waste <2690692950@qq.com>
* fix(doc): update dataset preprocessing API definition in KEP. Signed-off-by: Electronic-Waste <2690692950@qq.com>
* chore(sdk): Add get_args_in_dataset_preprocess_config func. Signed-off-by: Electronic-Waste <2690692950@qq.com>
* fix(sdk): Add the prefix path for dataset. Signed-off-by: Electronic-Waste <2690692950@qq.com>
* chore(manifests): Load local datasets. Signed-off-by: Electronic-Waste <2690692950@qq.com>
* fix(sdk): Add TorchTune prefix to dataset class. Signed-off-by: Electronic-Waste <2690692950@qq.com>
* chore(initializer): Update HF dataset initializer to support data_dir and data_files. Signed-off-by: Electronic-Waste <2690692950@qq.com>
* fix(sdk): remove data_files and data_dir definition. Signed-off-by: Electronic-Waste <2690692950@qq.com>
* fix(sdk): extract data_files and data_dir from storage_uri. Signed-off-by: Electronic-Waste <2690692950@qq.com>
* fix(sdk): fix errors in UT. Signed-off-by: Electronic-Waste <2690692950@qq.com>
* fix(manifest): Update dataset.data_dir in torchtune CTRs. Signed-off-by: Electronic-Waste <2690692950@qq.com>
* fix(sdk): fix dataset_uri. Signed-off-by: Electronic-Waste <2690692950@qq.com>
---------
Signed-off-by: Electronic-Waste <2690692950@qq.com>
1 parent 8e67ea3 commit b890a2a

File tree

5 files changed

+135
-2
lines changed

5 files changed

+135
-2
lines changed

kubeflow/trainer/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,11 +27,13 @@
2727
from kubeflow.trainer.types.types import (
2828
BuiltinTrainer,
2929
CustomTrainer,
30+
DataFormat,
3031
DataType,
3132
Framework,
3233
HuggingFaceDatasetInitializer,
3334
HuggingFaceModelInitializer,
3435
Initializer,
36+
TorchTuneInstructDataset,
3537
Loss,
3638
Runtime,
3739
Trainer,

kubeflow/trainer/api/trainer_client.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,9 @@ def train(
193193

194194
# If users choose to use a builtin trainer for post-training.
195195
elif isinstance(trainer, types.BuiltinTrainer):
196-
trainer_crd = utils.get_trainer_crd_from_builtin_trainer(trainer)
196+
trainer_crd = utils.get_trainer_crd_from_builtin_trainer(
197+
trainer, initializer
198+
)
197199

198200
else:
199201
raise ValueError(

kubeflow/trainer/constants/constants.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,3 +122,6 @@
122122

123123
# The default entrypoint for mpirun.
124124
MPI_ENTRYPOINT = "mpirun"
125+
126+
# The Instruct Datasets class in torchtune
127+
TORCHTUNE_INSTRUCT_DATASET = "torchtune.datasets.instruct_dataset"

kubeflow/trainer/types/types.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,49 @@ class DataType(Enum):
6161
FP32 = "fp32"
6262

6363

64+
# Data file type for the TorchTune LLM Trainer.
65+
class DataFormat(Enum):
66+
"""Data file type for the TorchTune LLM Trainer."""
67+
68+
JSON = "json"
69+
CSV = "csv"
70+
PARQUET = "parquet"
71+
ARROW = "arrow"
72+
TEXT = "text"
73+
XML = "xml"
74+
75+
76+
# Configuration for the TorchTune Instruct dataset.
77+
@dataclass
78+
class TorchTuneInstructDataset:
79+
"""
80+
Configuration for the custom dataset with user instruction prompts and model responses.
81+
REF: https://pytorch.org/torchtune/main/generated/torchtune.datasets.instruct_dataset.html
82+
83+
Args:
84+
source (`Optional[DataFormat]`): Data file type.
85+
split (`Optional[str]`):
86+
The split of the dataset to use. You can use this argument to load a subset of
87+
a given split, e.g. split="train[:10%]". Default is `train`.
88+
train_on_input (`Optional[bool]`):
89+
Whether the model is trained on the user prompt or not. Default is False.
90+
new_system_prompt (`Optional[str]`):
91+
The new system prompt to use. If specified, prepend a system message.
92+
This can serve as instructions to guide the model response. Default is None.
93+
column_map (`Optional[Dict[str, str]]`):
94+
A mapping to change the expected "input" and "output" column names to the actual
95+
column names in the dataset. Keys should be "input" and "output" and values should
96+
be the actual column names. Default is None, keeping the default "input" and
97+
"output" column names.
98+
"""
99+
100+
source: Optional[DataFormat] = None
101+
split: Optional[str] = None
102+
train_on_input: Optional[bool] = None
103+
new_system_prompt: Optional[str] = None
104+
column_map: Optional[Dict[str, str]] = None
105+
106+
64107
# Configuration for the TorchTune LLM Trainer.
65108
@dataclass
66109
class TorchTuneConfig:
@@ -78,6 +121,8 @@ class TorchTuneConfig:
78121
loss (`Optional[Loss]`): The loss algorithm we use to fine-tune the LLM,
79122
e.g. `torchtune.modules.loss.CEWithChunkedOutputLoss`.
80123
num_nodes (`Optional[int]`): The number of nodes to use for training.
124+
dataset_preprocess_config (`Optional[TorchTuneInstructDataset]`):
125+
Configuration for the dataset preprocessing.
81126
resources_per_node (`Optional[Dict]`): The computing resources to allocate per node.
82127
"""
83128

@@ -86,6 +131,7 @@ class TorchTuneConfig:
86131
epochs: Optional[int] = None
87132
loss: Optional[Loss] = None
88133
num_nodes: Optional[int] = None
134+
dataset_preprocess_config: Optional[TorchTuneInstructDataset] = None
89135
resources_per_node: Optional[Dict] = None
90136

91137

kubeflow/trainer/utils/utils.py

Lines changed: 81 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,11 @@
1515
import inspect
1616
import os
1717
import queue
18+
import re
1819
import textwrap
1920
import threading
2021
from typing import Any, Callable, Dict, List, Optional, Tuple
22+
from urllib.parse import urlparse
2123

2224
import kubeflow.trainer.models as models
2325
from kubeflow.trainer.constants import constants
@@ -327,6 +329,7 @@ def get_entrypoint_using_train_func(
327329

328330
def get_args_using_torchtune_config(
329331
fine_tuning_config: types.TorchTuneConfig,
332+
initializer: Optional[types.Initializer] = None,
330333
) -> Tuple[List[str], List[str]]:
331334
"""
332335
Get the Trainer args from the TorchTuneConfig.
@@ -352,6 +355,32 @@ def get_args_using_torchtune_config(
352355
if fine_tuning_config.loss:
353356
args.append(f"loss={fine_tuning_config.loss}")
354357

358+
# Override the data dir or data files if it is provided.
359+
if isinstance(initializer, types.Initializer) and isinstance(
360+
initializer.dataset, types.HuggingFaceDatasetInitializer
361+
):
362+
storage_uri = (
363+
"hf://" + initializer.dataset.storage_uri
364+
if not initializer.dataset.storage_uri.startswith("hf://")
365+
else initializer.dataset.storage_uri
366+
)
367+
storage_uri_parsed = urlparse(storage_uri)
368+
relative_path = re.sub(r"^/[^/]+", "", storage_uri_parsed.path)
369+
370+
if "." in relative_path:
371+
args.append(
372+
f"dataset.data_files={os.path.join(constants.DATASET_PATH, relative_path)}"
373+
)
374+
else:
375+
args.append(
376+
f"dataset.data_dir={os.path.join(constants.DATASET_PATH, relative_path)}"
377+
)
378+
379+
if fine_tuning_config.dataset_preprocess_config:
380+
args += get_args_in_dataset_preprocess_config(
381+
fine_tuning_config.dataset_preprocess_config
382+
)
383+
355384
return constants.DEFAULT_TORCHTUNE_COMMAND, args
356385

357386

@@ -390,6 +419,7 @@ def get_trainer_crd_from_custom_trainer(
390419

391420
def get_trainer_crd_from_builtin_trainer(
392421
trainer: types.BuiltinTrainer,
422+
initializer: Optional[types.Initializer] = None,
393423
) -> models.TrainerV1alpha1Trainer:
394424
"""
395425
Get the Trainer CRD from the builtin trainer.
@@ -413,7 +443,7 @@ def get_trainer_crd_from_builtin_trainer(
413443
# the torchtune config in the runtime plugin.
414444
# Ref:https://github.com/kubeflow/trainer/tree/master/docs/proposals/2401-llm-trainer-v2
415445
trainer_crd.command, trainer_crd.args = get_args_using_torchtune_config(
416-
trainer.config
446+
trainer.config, initializer
417447
)
418448

419449
return trainer_crd
@@ -507,3 +537,53 @@ def get_log_queue_pool(log_streams: List[Any]) -> List[queue.Queue]:
507537
pool.append(q)
508538
threading.Thread(target=wrap_log_stream, args=(q, log_stream)).start()
509539
return pool
540+
541+
542+
def get_args_in_dataset_preprocess_config(
543+
dataset_preprocess_config: types.TorchTuneInstructDataset,
544+
) -> List[str]:
545+
"""
546+
Get the args from the given dataset preprocess config.
547+
"""
548+
args = []
549+
550+
if not isinstance(dataset_preprocess_config, types.TorchTuneInstructDataset):
551+
raise ValueError(
552+
f"Invalid dataset preprocess config type: {type(dataset_preprocess_config)}."
553+
)
554+
555+
# Override the dataset type field in the torchtune config.
556+
args.append(f"dataset={constants.TORCHTUNE_INSTRUCT_DATASET}")
557+
558+
# Override the dataset source field if it is provided.
559+
if dataset_preprocess_config.source:
560+
if not isinstance(dataset_preprocess_config.source, types.DataFormat):
561+
raise ValueError(
562+
f"Invalid data format: {dataset_preprocess_config.source}."
563+
)
564+
565+
args.append(f"dataset.source={dataset_preprocess_config.source}")
566+
567+
# Override the data dir or data files if it is provided.
568+
569+
# Override the split field if it is provided.
570+
if dataset_preprocess_config.split:
571+
args.append(f"dataset.split={dataset_preprocess_config.split}")
572+
573+
# Override the train_on_input field if it is provided.
574+
if dataset_preprocess_config.train_on_input:
575+
args.append(
576+
f"dataset.train_on_input={dataset_preprocess_config.train_on_input}"
577+
)
578+
579+
# Override the new_system_prompt field if it is provided.
580+
if dataset_preprocess_config.new_system_prompt:
581+
args.append(
582+
f"dataset.new_system_prompt={dataset_preprocess_config.new_system_prompt}"
583+
)
584+
585+
# Override the column_map field if it is provided.
586+
if dataset_preprocess_config.column_map:
587+
args.append(f"dataset.column_map={dataset_preprocess_config.column_map}")
588+
589+
return args

Comments (0)