From 88089e8b3202a69d1265906cff7d5a6ef1edd749 Mon Sep 17 00:00:00 2001
From: Charles Goddard
Date: Fri, 14 Jul 2023 16:46:39 -0700
Subject: [PATCH 1/4] Add ability to pass 'name' argument to load_dataset

---
 src/axolotl/utils/data.py | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py
index eed7d6db11..8df1e4d386 100644
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -94,6 +94,7 @@ def load_tokenized_prepared_datasets(
             try:
                 load_dataset(
                     d.path,
+                    name=d.name,
                     streaming=True,
                     use_auth_token=use_auth_token,
                 )
@@ -107,6 +108,7 @@
                 if local_path.is_dir():
                     ds = load_dataset(
                         d.path,
+                        name=d.name,
                         data_files=d.data_files,
                         streaming=False,
                         split=None,
@@ -114,6 +116,7 @@
                 elif local_path.is_file():
                     ds = load_dataset(
                         "json",
+                        name=d.name,
                         data_files=d.path,
                         streaming=False,
                         split=None,
@@ -123,26 +126,22 @@
                         "unhandled dataset load: local path exists, but is neither a directory or a file"
                     )
             elif ds_from_hub:
-                if d.data_files:
-                    ds = load_dataset(
-                        d.path,
-                        streaming=False,
-                        data_files=d.data_files,
-                        use_auth_token=use_auth_token,
-                    )
-                else:
-                    ds = load_dataset(
-                        d.path,
-                        streaming=False,
-                        use_auth_token=use_auth_token,
-                    )
+                ds = load_dataset(
+                    d.path,
+                    name=d.name,
+                    streaming=False,
+                    data_files=d.data_files,
+                    use_auth_token=use_auth_token,
+                )
             else:
                 fp = hf_hub_download(
                     repo_id=d.path,
                     repo_type="dataset",
                     filename=d.data_files,
                 )
-                ds = load_dataset("json", data_files=fp, streaming=False, split=None)
+                ds = load_dataset(
+                    "json", name=d.name, data_files=fp, streaming=False, split=None
+                )
             if not ds:
                 raise ValueError("unhandled dataset load")
             # support for using a subset of the data

From 8bba64258e3436bed371ea4b74e5c4e1d6f35434 Mon Sep 17 00:00:00 2001
From: Charles Goddard
Date: Fri, 14 Jul 2023 20:46:21 -0700
Subject: [PATCH 2/4] Add example of dataset with configuration name to README

---
 README.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/README.md b/README.md
index ca36acbcb9..d032d1d312 100644
--- a/README.md
+++ b/README.md
@@ -261,6 +261,12 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic
   datasets:
     - path: vicgalle/alpaca-gpt4
       type: alpaca # format from earlier
+  
+  # huggingface repo with specific configuration/subset
+  datasets:
+    - path: EleutherAI/pile
+      name: enron_emails
+      type: completion # format from earlier
 
   # local
   datasets:

From 46032a1a1fe4e40fbc4175fa30779b08d902f9ce Mon Sep 17 00:00:00 2001
From: Charles Goddard
Date: Fri, 14 Jul 2023 20:57:27 -0700
Subject: [PATCH 3/4] Fix formatting mistake

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index d032d1d312..6a09454ebc 100644
--- a/README.md
+++ b/README.md
@@ -261,7 +261,7 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic
   datasets:
     - path: vicgalle/alpaca-gpt4
       type: alpaca # format from earlier
-  
+
   # huggingface repo with specific configuration/subset
   datasets:
     - path: EleutherAI/pile

From 3cdd8e4122a7192b4369f7502df80c40b27e79b4 Mon Sep 17 00:00:00 2001
From: Charles Goddard
Date: Sat, 15 Jul 2023 13:17:37 -0700
Subject: [PATCH 4/4] Add dataset name to all yaml options in README

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 6a09454ebc..f3dab2c511 100644
--- a/README.md
+++ b/README.md
@@ -350,6 +350,7 @@ datasets:
     type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
     data_files: # path to source data files
     shards: # number of shards to split data into
+    name: # name of dataset configuration to load
 
 # axolotl attempts to save the dataset as an arrow after packing the data together so
 # subsequent training attempts load faster, relative path
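
Reviewer context, not part of the patch series: the new `name` field is passed straight through to `datasets.load_dataset`, where it selects a named configuration (subset) of a Hugging Face dataset repository. Below is a minimal standalone sketch of the call the new config field boils down to, reusing the EleutherAI/pile and enron_emails names from the README example above; it assumes the `datasets` package is installed and that this configuration is still downloadable.

    # Illustrative sketch only; not applied by these patches.
    from datasets import load_dataset

    # Equivalent of the new config fields:
    #   datasets:
    #     - path: EleutherAI/pile
    #       name: enron_emails
    ds = load_dataset(
        "EleutherAI/pile",
        name="enron_emails",  # `name` selects the dataset configuration/subset
        streaming=True,       # mirrors the streaming existence probe in PATCH 1/4
    )
    print(ds)

One note on the `elif ds_from_hub:` hunk in PATCH 1/4: collapsing the `if d.data_files:` branch is behavior-preserving, since `data_files` defaults to `None` in `load_dataset`, so passing `data_files=d.data_files` when no data files are configured is the same as omitting the argument.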