diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 6010b19b6f..72ca19834b 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -589,7 +589,7 @@ def _process_data_source( ('uc_volume', source_dataset_path[len('dbfs:'):], true_split), ) # Check for HF path - elif 'hf_name' in dataset: + elif 'hf_name' in dataset and dataset['hf_name']: hf_path = dataset['hf_name'] backend, _, _ = parse_uri(hf_path) if backend: @@ -600,7 +600,7 @@ def _process_data_source( else: data_paths.append(('hf', hf_path, true_split)) # Check for remote path - elif 'remote' in dataset: + elif 'remote' in dataset and dataset['remote']: remote_path = dataset['remote'] backend, _, _ = parse_uri(remote_path) if backend: @@ -610,7 +610,11 @@ def _process_data_source( ) if cfg_split else remote_path data_paths.append((backend, remote_path, true_split)) else: + # No backend detected so assume local path data_paths.append(('local', remote_path, true_split)) + # Check for local path + elif 'local' in dataset and dataset['local']: + data_paths.append(('local', dataset['local'], true_split)) else: log.warning('DataSource Not Found.')