Add text-gen notebook for llama models (#2592)

* Add text-gen notebook for llama models * Update text-gen notebook * Fixing black formatting issues * resolve review comments
Azure · Sep 7, 2023 · 4c1c62f · 4c1c62f
1 parent c6ba50d
commit 4c1c62f
Show file tree

Hide file tree

Showing 3 changed files with 828 additions and 0 deletions.
diff --git a/...hon/foundation-models/system/finetune/Llama-notebooks/text-generation/download-dataset.py b/...hon/foundation-models/system/finetune/Llama-notebooks/text-generation/download-dataset.py
@@ -0,0 +1,33 @@
+# Script: save every split of a Hugging Face dataset as a JSON Lines file; argparse reads the CLI options
+import argparse, os
+
+parser = argparse.ArgumentParser()
+# --dataset: name of the dataset to download (default "samsum")
+parser.add_argument("--dataset", type=str, default="samsum", help="dataset name")
+# --dataset_subset: dataset subset name (default "split"); NOTE(review): parsed but not referenced anywhere else in this script
+parser.add_argument(
+    "--dataset_subset", type=str, default="split", help="dataset subset name"
+)
+# --download_dir: directory the split files are written to (default "data")
+parser.add_argument(
+    "--download_dir",
+    type=str,
+    default="data",
+    help="directory to download the dataset to",
+)
+args = parser.parse_args()
+
+# create the download directory (and any missing parent directories) if it does not exist
+if not os.path.exists(args.download_dir):
+    os.makedirs(args.download_dir)
+
+
+# import the Hugging Face datasets helpers; this happens only after argument parsing and directory setup
+from datasets import load_dataset, get_dataset_split_names
+
+for split in get_dataset_split_names(args.dataset):
+    # load this split of the dataset named by --dataset
+    dataset = load_dataset(args.dataset, split=split)
+    # write the split to <download_dir>/<split>.jsonl as a JSON Lines file
+    dataset.to_json(os.path.join(args.download_dir, f"{split}.jsonl"))
+    # NOTE(review): the original comment here read "print dataset features", but no print statement follows
diff --git a/...dation-models/system/finetune/Llama-notebooks/text-generation/text-generation-config.json b/...dation-models/system/finetune/Llama-notebooks/text-generation/text-generation-config.json
@@ -0,0 +1 @@
+{}