Skip to content

Commit

Permalink
Add text-gen notebook for llama models (#2592)
Browse files Browse the repository at this point in the history
* Add text-gen notebook for llama models

* Update text-gen notebook

* Fixing black formatting issues

* resolve review comments
  • Loading branch information
aggarwal-k authored Sep 7, 2023
1 parent c6ba50d commit 4c1c62f
Show file tree
Hide file tree
Showing 3 changed files with 828 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Download every split of a Hugging Face dataset and save each split as a
# JSON Lines file named "<split>.jsonl" inside the download directory.
#
# Command line arguments:
#   --dataset         name of the dataset to download (default: "samsum")
#   --dataset_subset  dataset subset name (default: "split"); see NOTE below
#   --download_dir    directory the files are written to (default: "data")

# import libraries to parse command line arguments and handle paths
import argparse
import os

parser = argparse.ArgumentParser()
# add an argument to specify a dataset name to download
parser.add_argument("--dataset", type=str, default="samsum", help="dataset name")
# add an argument to specify a dataset subset name
# NOTE(review): this argument is accepted but never read below, so every
# split of the dataset is downloaded regardless of its value. Confirm whether
# it should be forwarded to the `datasets` calls before relying on it.
parser.add_argument(
    "--dataset_subset", type=str, default="split", help="dataset subset name"
)
# add an argument to specify the directory to download the dataset to
parser.add_argument(
    "--download_dir",
    type=str,
    default="data",
    help="directory to download the dataset to",
)
args = parser.parse_args()

# create the download directory if it does not exist; exist_ok=True avoids the
# race between a separate existence check and the directory creation
os.makedirs(args.download_dir, exist_ok=True)


# import hugging face datasets library (kept after argument parsing, as in the
# original script, so the arguments are parsed before this import runs)
from datasets import get_dataset_split_names, load_dataset

for split in get_dataset_split_names(args.dataset):
    # load the split of the dataset
    dataset = load_dataset(args.dataset, split=split)
    # save the split of the dataset to the download directory as json lines file
    dataset.to_json(os.path.join(args.download_dir, f"{split}.jsonl"))
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{}
Loading

0 comments on commit 4c1c62f

Please sign in to comment.