From 88089e8b3202a69d1265906cff7d5a6ef1edd749 Mon Sep 17 00:00:00 2001
From: Charles Goddard
Date: Fri, 14 Jul 2023 16:46:39 -0700
Subject: [PATCH 1/4] Add ability to pass 'name' argument to load_dataset

---
 src/axolotl/utils/data.py | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py
index eed7d6db11..8df1e4d386 100644
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -94,6 +94,7 @@ def load_tokenized_prepared_datasets(
             try:
                 load_dataset(
                     d.path,
+                    name=d.name,
                     streaming=True,
                     use_auth_token=use_auth_token,
                 )
@@ -107,6 +108,7 @@
                 if local_path.is_dir():
                     ds = load_dataset(
                         d.path,
+                        name=d.name,
                         data_files=d.data_files,
                         streaming=False,
                         split=None,
@@ -114,6 +116,7 @@
                 elif local_path.is_file():
                     ds = load_dataset(
                         "json",
+                        name=d.name,
                         data_files=d.path,
                         streaming=False,
                         split=None,
@@ -123,26 +126,22 @@
                         "unhandled dataset load: local path exists, but is neither a directory or a file"
                     )
             elif ds_from_hub:
-                if d.data_files:
-                    ds = load_dataset(
-                        d.path,
-                        streaming=False,
-                        data_files=d.data_files,
-                        use_auth_token=use_auth_token,
-                    )
-                else:
-                    ds = load_dataset(
-                        d.path,
-                        streaming=False,
-                        use_auth_token=use_auth_token,
-                    )
+                ds = load_dataset(
+                    d.path,
+                    name=d.name,
+                    streaming=False,
+                    data_files=d.data_files,
+                    use_auth_token=use_auth_token,
+                )
             else:
                 fp = hf_hub_download(
                     repo_id=d.path,
                     repo_type="dataset",
                     filename=d.data_files,
                 )
-                ds = load_dataset("json", data_files=fp, streaming=False, split=None)
+                ds = load_dataset(
+                    "json", name=d.name, data_files=fp, streaming=False, split=None
+                )
             if not ds:
                 raise ValueError("unhandled dataset load")
             # support for using a subset of the data

From 8bba64258e3436bed371ea4b74e5c4e1d6f35434 Mon Sep 17 00:00:00 2001
From: Charles Goddard
Date: Fri, 14 Jul 2023 20:46:21 -0700
Subject: [PATCH 2/4] Add example of dataset with configuration name to README

---
 README.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/README.md b/README.md
index ca36acbcb9..d032d1d312 100644
--- a/README.md
+++ b/README.md
@@ -261,6 +261,12 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic
   datasets:
     - path: vicgalle/alpaca-gpt4
       type: alpaca # format from earlier
+  
+  # huggingface repo with specific configuration/subset
+  datasets:
+    - path: EleutherAI/pile
+      name: enron_emails
+      type: completion # format from earlier
 
   # local
   datasets:

From 46032a1a1fe4e40fbc4175fa30779b08d902f9ce Mon Sep 17 00:00:00 2001
From: Charles Goddard
Date: Fri, 14 Jul 2023 20:57:27 -0700
Subject: [PATCH 3/4] Fix formatting mistake

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index d032d1d312..6a09454ebc 100644
--- a/README.md
+++ b/README.md
@@ -261,7 +261,7 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic
   datasets:
     - path: vicgalle/alpaca-gpt4
       type: alpaca # format from earlier
-  
+
   # huggingface repo with specific configuration/subset
   datasets:
     - path: EleutherAI/pile

From 3cdd8e4122a7192b4369f7502df80c40b27e79b4 Mon Sep 17 00:00:00 2001
From: Charles Goddard
Date: Sat, 15 Jul 2023 13:17:37 -0700
Subject: [PATCH 4/4] Add dataset name to all yaml options in README

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 6a09454ebc..f3dab2c511 100644
--- a/README.md
+++ b/README.md
@@ -350,6 +350,7 @@ datasets:
     type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
     data_files: # path to source data files
     shards: # number of shards to split data into
+    name: # name of dataset configuration to load
 
 # axolotl attempts to save the dataset as an arrow after packing the data together so
 # subsequent training attempts load faster, relative path
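
Reviewer context, not part of the patch series: the new `name` field is passed straight through to `datasets.load_dataset`, where it selects a named configuration (subset) of a Hugging Face dataset repository. Below is a minimal standalone sketch of the call the new config field boils down to, reusing the EleutherAI/pile and enron_emails names from the README example above; it assumes the `datasets` package is installed and that this configuration is still downloadable.

    # Illustrative sketch only; not applied by these patches.
    from datasets import load_dataset

    # Equivalent of the new config fields:
    #   datasets:
    #     - path: EleutherAI/pile
    #       name: enron_emails
    ds = load_dataset(
        "EleutherAI/pile",
        name="enron_emails",  # `name` selects the dataset configuration/subset
        streaming=True,       # mirrors the streaming existence probe in PATCH 1/4
    )
    print(ds)

One note on the `elif ds_from_hub:` hunk in PATCH 1/4: collapsing the `if d.data_files:` branch is behavior-preserving, since `data_files` defaults to `None` in `load_dataset`, so passing `data_files=d.data_files` when no data files are configured is the same as omitting the argument.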