axolotl-ai-cloud · winglian · Nov 16, 2024 · Nov 13, 2024 · Nov 15, 2024
diff --git a/docs/config.qmd b/docs/config.qmd
@@ -91,6 +91,7 @@ datasets:
     name: # Optional[str] name of dataset configuration to load
     train_on_split: train # Optional[str] name of dataset split to load from
     revision: # Optional[str] The specific revision of the dataset to use when loading from the Hugging Face Hub. This can be a commit hash, tag, or branch name. If not specified, the latest version will be used. This parameter is ignored for local datasets.
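+    trust_remote_code: # Optional[bool] Whether to allow executing remote code (e.g. a custom dataset loading script) shipped with the dataset when loading it. Only enable this for sources you trust.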
 
   # Custom user instruction prompt
   - path: repo

diff --git a/src/axolotl/utils/data/sft.py b/src/axolotl/utils/data/sft.py
@@ -260,6 +260,7 @@ def for_d_in_datasets(dataset_configs):
         for config_dataset in for_d_in_datasets(cfg_datasets):
             ds: Optional[Union[Dataset, DatasetDict]] = None
             ds_from_hub = False
+            ds_trust_remote_code = config_dataset.trust_remote_code
             try:
                 # this is just a basic check to see if the path is a
                 # valid HF dataset that's loadable
@@ -269,6 +270,7 @@ def for_d_in_datasets(dataset_configs):
                     streaming=True,
                     token=use_auth_token,
                     revision=config_dataset.revision,
+                    trust_remote_code=ds_trust_remote_code,
                 )
                 ds_from_hub = True
             except (FileNotFoundError, ConnectionError, HFValidationError, ValueError):
@@ -366,14 +368,15 @@ def for_d_in_datasets(dataset_configs):
             elif ds_from_hub:
                 load_ds_kwargs = {}
                 if config_dataset.split:
-                    load_ds_kwargs = {"split": config_dataset.split}
+                    load_ds_kwargs["split"] = config_dataset.split
                 ds = load_dataset(
                     config_dataset.path,
                     name=config_dataset.name,
                     streaming=False,
                     data_files=config_dataset.data_files,
                     token=use_auth_token,
                     revision=config_dataset.revision,
+                    trust_remote_code=config_dataset.trust_remote_code,
                     **load_ds_kwargs,
                 )
             elif ds_from_cloud and remote_file_system:
@@ -391,6 +394,7 @@ def for_d_in_datasets(dataset_configs):
                         streaming=False,
                         split=None,
                         storage_options=storage_options,
+                        trust_remote_code=config_dataset.trust_remote_code,
                     )
             elif config_dataset.path.startswith("https://"):
                 ds_type = get_ds_type(config_dataset)
@@ -401,6 +405,7 @@ def for_d_in_datasets(dataset_configs):
                     streaming=False,
                     split=None,
                     storage_options=storage_options,
+                    trust_remote_code=config_dataset.trust_remote_code,
                 )
             else:
                 if isinstance(config_dataset.data_files, str):