-
Notifications
You must be signed in to change notification settings - Fork 1.5k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add text-gen notebook for llama models (#2592)
* Add text-gen notebook for llama models * Update text-gen notebook * Fixing black formatting issues * resolve review comments
- Loading branch information
1 parent
c6ba50d
commit 4c1c62f
Showing
3 changed files
with
828 additions
and
0 deletions.
There are no files selected for viewing
33 changes: 33 additions & 0 deletions
33
...hon/foundation-models/system/finetune/Llama-notebooks/text-generation/download-dataset.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
# import libraries to parse command line arguments and to work with filesystem paths | ||
import argparse, os | ||
|
||
parser = argparse.ArgumentParser() | ||
# add an argument to specify a dataset name to download | ||
parser.add_argument("--dataset", type=str, default="samsum", help="dataset name") | ||
# add an argument to specify a dataset subset name (NOTE: parsed, but not used by the download loop below) | ||
parser.add_argument( | ||
"--dataset_subset", type=str, default="split", help="dataset subset name" | ||
) | ||
# add an argument to specify the directory to download the dataset to | ||
parser.add_argument( | ||
"--download_dir", | ||
type=str, | ||
default="data", | ||
help="directory to download the dataset to", | ||
) | ||
args = parser.parse_args() | ||
|
||
# create the download directory if it does not exist | ||
if not os.path.exists(args.download_dir): | ||
os.makedirs(args.download_dir) | ||
|
||
|
||
# import hugging face datasets library | ||
from datasets import load_dataset, get_dataset_split_names | ||
|
||
for split in get_dataset_split_names(args.dataset): | ||
# load the split of the dataset | ||
dataset = load_dataset(args.dataset, split=split) | ||
# save the split of the dataset to the download directory as a JSON Lines file | ||
dataset.to_json(os.path.join(args.download_dir, f"{split}.jsonl")) | ||
# print dataset features |
1 change: 1 addition & 0 deletions
1
...dation-models/system/finetune/Llama-notebooks/text-generation/text-generation-config.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{} |
Oops, something went wrong.