From 8cb6522571dd6b0927ddbac0cd3bcd33e5859d2e Mon Sep 17 00:00:00 2001
From: Xiaohan Zhang
Date: Fri, 22 Dec 2023 22:18:04 -0800
Subject: [PATCH 01/63] add validation script

---
 .../data_prep/validate_and_tokenize_data.py | 804 ++++++++++++++++++
 1 file changed, 804 insertions(+)
 create mode 100644 scripts/data_prep/validate_and_tokenize_data.py

diff --git a/scripts/data_prep/validate_and_tokenize_data.py b/scripts/data_prep/validate_and_tokenize_data.py
new file mode 100644
index 0000000000..68b1211f85
--- /dev/null
+++ b/scripts/data_prep/validate_and_tokenize_data.py
@@ -0,0 +1,804 @@
+# Databricks notebook source
+# MAGIC %md
+# MAGIC JIRA: https://databricks.atlassian.net/jira/software/c/projects/STR/issues/STR-141?filter=allissues
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ## Warning: Important Notes on Script Usage
+# MAGIC
+# MAGIC ### Script Purpose:
+# MAGIC - **Not for Training**: This script is not used during the training process.
+# MAGIC - **Ad-Hoc Validation**: It is an ad-hoc utility that users run independently before starting fine-tuning.
+# MAGIC - **Data Verification**: Its primary function is to validate the user's data before they invoke the Fine-Tuning (FT) API.
+# MAGIC - **Cost Estimation**: Users can estimate the cost of a fine-tuning run from the token counts this script reports.
+# MAGIC
+# MAGIC ### Usage Scenario:
+# MAGIC This script is particularly useful when there is a risk that the data is malformed. It acts as a preventive check on data integrity and helps with cost assessment before fine-tuning.
+# MAGIC
+# MAGIC ### Note on Long-Term Solution:
+# MAGIC - **Temporary Measure**: This script is a stop-gap solution.
+# MAGIC - **Future Development**: We are in the process of developing a long-term data preparation service, which will eventually replace this script.
+# MAGIC
+# MAGIC ### Checks Include:
+# MAGIC - Check the input dataset:
+# MAGIC 1) verify that the dataset input format is valid (must be one of: Hugging Face, Delta table, dbfs:/Volumes, or cloud path);
+# MAGIC - Check the HF input location:
+# MAGIC 1) load the dataset info and check that it is accessible;
+# MAGIC 2) verify that the split exists.
+# MAGIC - Check the cloud path location:
+# MAGIC 1) check that the cloud prefix is one of Composer's supported object stores (gs, s3, oci);
+# MAGIC 2) check whether listing the objects returns nothing.
+# MAGIC - count_tokens:
+# MAGIC 1) For the IFT task: validate tokenization by running the tokenizer + filter over the entire dataset and count the number of tokens. Throws an error if there are any empty prompts or responses.
+# MAGIC 2) For the CPT task: call download_text_to_mds.py and count the tokens in the resulting MDS dataset. Note this could take a long time.
+# MAGIC
+# MAGIC ### Questions:
+# MAGIC - Is "download_text_to_mds.py" always callable from the validation script?
+# MAGIC - What is the function to reuse to run tokenization on HF datasets with filters?
+# MAGIC - The inputs to this validation script is assumed to be the same or a subset of the FT API arguments, i.e., a configuration like below. Is this a valid assumption?
+# MAGIC ``` +# MAGIC cfg = { +# MAGIC model: str, +# MAGIC train_data_path: str, +# MAGIC save_folder: str, +# MAGIC *, +# MAGIC task_type: Optional[str] = "INSTRUCTION_FINETUNE", +# MAGIC eval_data_path: Optional[str] = None, +# MAGIC eval_prompts: Optional[List[str]] = None, +# MAGIC custom_weights_path: Optional[str] = None, +# MAGIC training_duration: Optional[str] = None, +# MAGIC learning_rate: Optional[float] = None, +# MAGIC context_length: Optional[int] = None, +# MAGIC experiment_trackers: Optional[List[Dict]] = None, +# MAGIC data_prep_config: Optional[Dict] = None, +# MAGIC disable_credentials_check: Optional[bool] = None, +# MAGIC timeout: Optional[float] = 10, +# MAGIC future: Literal[False] = False, +# MAGIC } +# MAGIC ``` + +# COMMAND ---------- + +# MAGIC %pip install llm-foundry + +# COMMAND ---------- + +dbutils.library.restartPython() + +# COMMAND ---------- + +import os +import re +from enum import Enum +from composer.utils import (ObjectStore, maybe_create_object_store_from_uri, parse_uri) +from torch.utils.data import DataLoader +from streaming import StreamingDataset +import numpy as np +from omegaconf import OmegaConf as om + +# COMMAND ---------- + +FT_API_args = Namespace( + model = 'EleutherAI/gpt-neox-20b', + train_data_path: str, + save_folder: str, + task_type: Optional[str] = "INSTRUCTION_FINETUNE", + eval_data_path = None, + eval_prompts = None, + custom_weights_path = None, + training_duration = None, + learning_rate = None, + context_length = None, + experiment_trackers = None, + disable_credentials_check = None, + # Extra argument to add to FT API + # See comment https://databricks.atlassian.net/browse/STR-141?focusedCommentId=4308948 + data_prep_config = {'data_validation': True, 'data_prep': False}, + timeout = 10, + future = False, +) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## Utility Functions + +# COMMAND ---------- + +def check_HF_datasets(dataset_names_with_splits): + from huggingface_hub import dataset_info + from datasets import get_dataset_split_names + import os + token = os.environ.get("HUGGING_FACE_HUB_TOKEN") + for dataset_name_with_split in dataset_names_with_splits: + dataset_name, split = os.path.split(dataset_name_with_split) + # make sure we have a dataset and split + if not dataset_name or not split: + return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Please ensure that you include the split name (e.g. 'mosaicml/dolly_hhrlhf/train')." + # check user access to the dataset + try: + info = dataset_info(dataset_name) + except: + token_warning = "" + if not token: + token_warning = " If this is a private dataset, please set your HUGGING_FACE_HUB_TOKEN using: mcli create secret hf." + return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Please ensure that the dataset exists and that you have access to it. Remember to include the split name (e.g. 'mosaicml/dolly_hhrlhf/train')." + token_warning + # check that split exists + try: + splits = get_dataset_split_names(dataset_name) + except: # error raised in the case of multiple subsets + return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Please make sure that the split is valid and that your dataset does not have subsets." + if split not in splits: + return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Split not found." + return True, "" + +# COMMAND ---------- + +def integrity_check(out: Union[str, Tuple[str, str]]): + """Check if the index file has integrity. 
+ + If index is a cloud url, first download it to a temp local file. + + Args: + out (Union[str, Tuple[str,str]]): MDS dataset path + """ + + def get_expected(mds_root: str): + n_shard_files = 0 + cu = CloudUploader.get(mds_root, exist_ok=True, keep_local=True) + for o in cu.list_objects(): + if o.endswith('.mds'): + n_shard_files += 1 + return n_shard_files + + cu = CloudUploader.get(out, keep_local=True, exist_ok=True) + + with tempfile.TemporaryDirectory() as temp_dir: + if cu.remote: + download_file(os.path.join(cu.remote, 'index.json'), + os.path.join(temp_dir, 'index.json'), + timeout=60) + expected_n_shard_files = get_expected(cu.remote) + local_merged_index_path = os.path.join(temp_dir, 'index.json') + else: + local_merged_index_path = os.path.join(cu.local, 'index.json') + expected_n_shard_files = get_expected(cu.local) + + merged_index = json.load(open(local_merged_index_path, 'r')) + n_shard_files = len({b['raw_data']['basename'] for b in merged_index['shards']}) + assert n_shard_files == expected_n_shard_files, f'expected {expected_n_shard_files} shard files but got {n_shard_files}' + +# COMMAND ---------- + +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +# Taken from llmfoundry/scripts/data_prep/convert_text_to_mds.py + +import logging +import math +import os +import tempfile +from argparse import ArgumentParser, Namespace +from concurrent.futures import ProcessPoolExecutor +from glob import glob +from typing import Iterable, List, Tuple, cast + +import psutil +from composer.utils import (ObjectStore, maybe_create_object_store_from_uri, + parse_uri) +from streaming import MDSWriter +from tqdm import tqdm +from transformers import AutoTokenizer + +from llmfoundry.data import ConcatTokensDataset +from llmfoundry.utils.data_prep_utils import (DownloadingIterable, + merge_shard_groups) + +log = logging.getLogger(__name__) +DONE_FILENAME = '.text_to_mds_conversion_done' + + +def parse_args( tokenizer, + concat_tokens, + output_folder, + input_folder, + compression = 'zstd', + bos_text = '', + eos_text = '', + no_wrap = False , + processes = 32, # min(max(psutil.cpu_count() - 2, 1), 32), + reprocess = False ) -> Namespace: + + parsed = Namespace(tokenizer = tokenizer, + concat_tokens = model_max_length, + output_folder = output_folder, + input_folder = input_folder, + eos_text = eos_text, + bos_text = bos_text, + no_wrap = no_wrap, + compression = compression, + processes = processes, + reprocess = reprocess) + + # Make sure we have needed concat options + if (parsed.concat_tokens is not None and + isinstance(parsed.concat_tokens, int) and parsed.tokenizer is None): + parser.error( + 'When setting --concat_tokens, you must specify a --tokenizer') + + # now that we have validated them, change BOS/EOS to strings + if parsed.bos_text is None: + parsed.bos_text = '' + if parsed.eos_text is None: + parsed.eos_text = '' + return parsed + + +def get_object_names(input_folder: str) -> List[str]: + """Get object names from a local or remote folder. + + Args: + input_folder (str): local or remote folder path. 
+ """ + object_store = maybe_create_object_store_from_uri(input_folder) + if object_store is not None: + _, _, folder_prefix = parse_uri(input_folder) + names = [ + name for name in object_store.list_objects(folder_prefix) + if name.endswith('.txt') + ] + else: + # input_folder is a local folder + names = [ + text_file for dirpath, _, _ in os.walk(input_folder) + for text_file in glob(os.path.join(dirpath, '*.txt')) + ] + # return names, sizes + log.info(f'Found {len(names)} text files at {input_folder}') + + return names + + +def get_task_args( + object_names: List[str], + output_root: str, + input_folder: str, + n_groups: int, + tokenizer_name: str, + concat_tokens: int, + eos_text: str, + bos_text: str, + no_wrap: bool, + compression: str, +) -> Iterable: + """Get download_and_convert arguments split across n_groups. + + Each group handles a portion of object_names. + + Args: + object_names (List[str]): Names of objects to process + output_root (str): Folder to write MDS shards to + input_folder (str): Folder of text files to process + n_groups (int): Number of groups to split the object names into + tokenizer_name (str): Name of tokenizer to use + concat_tokens (int): Concantenate up to this many tokens + eos_text (str): Textend to append to each example to separate concatenated samples + bos_text (str): Text to prepend to each example to separate concatenated samples + no_wrap: (bool): Whether to let text examples wrap across multiple training examples + compression (str): The compression algorithm to use for MDS writing + """ + num_objects = len(object_names) + objs_per_group = math.ceil(num_objects / n_groups) + for group, i in enumerate(range(0, num_objects, objs_per_group)): + output_subdir = os.path.join(output_root, str(group)) + yield ( + object_names[i:min(i + objs_per_group, num_objects)], + output_subdir, + input_folder, + tokenizer_name, + concat_tokens, + eos_text, + bos_text, + no_wrap, + compression, + ) + + +def download_and_convert_starargs(args: Tuple): + """Helper function to call download_and_convert with star args. + + This helps us use download_and_convert with mutiprocessing. + """ + return download_and_convert(*args) + + +def download_and_convert( + file_names: List[str], + output_folder: str, + input_folder: str, + tokenizer_name: str, + concat_tokens: int, + eos_text: str, + bos_text: str, + no_wrap: bool, + compression: str, +): + """Downloads and converts text fies to MDS format. 
+ + Args: + file_names (List[str]): Files to process + output_folder (str): Folder to write MDS shards to + input_folder (str): Folder of text files to process + tokenizer_name (str): Name of tokenizer to use + concat_tokens (int): Concantenate up to this many tokens + eos_text (str): Textend to append to each example to separate concatenated samples + bos_text (str): Text to prepend to each example to separate concatenated samples + no_wrap: (bool): Whether to let text examples wrap across multiple training examples + compression (str): The compression algorithm to use for MDS writing + """ + object_store = maybe_create_object_store_from_uri(input_folder) + + # Download file_names + with tempfile.TemporaryDirectory() as tmp_dir: + downloading_iter = DownloadingIterable(object_names=file_names, + output_folder=tmp_dir, + object_store=object_store) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + tokenizer.model_max_length = 5000000000 # Hack to prevent warnings from HuggingFace + + # Use the ConcatTokensDataset from LLM-foundry to concatenate sequences of tokens up + # to the maximum sequence length + dataset = ConcatTokensDataset( + hf_dataset=downloading_iter, + max_length=concat_tokens, + tokenizer=tokenizer, + eos_text=eos_text, + bos_text=bos_text, + no_wrap=no_wrap, + ) + + columns = {'tokens': 'bytes'} + + log.info('Converting to MDS format...') + with MDSWriter(out=output_folder, + columns=columns, + compression=compression) as out: + for sample in tqdm(dataset): + out.write(sample) + + +def is_remote_path(path: str) -> bool: + """Checks whether a path is a remote path. + + Args: + path (str): path to check + """ + backend, _, _ = parse_uri(path) + return backend != '' + + +def is_already_processed(output_root: str, args_str: str, + object_names: List[str]) -> bool: + """Determines whether a group of text files has already been processed. + + Checks the done fie at output root to determine this. + + Args: + output_root (str): Output folder where a done file may exist + args_str (str): String representation of the arguments + object_names (List[str]): Names of objects to convert to MDS format + """ + # Retrieve the done file contents + output_object_store = maybe_create_object_store_from_uri(output_root) + if output_object_store is not None: + # Download and read the done file from the remote object store + _, _, output_folder_prefix = parse_uri(output_root) + try: + with tempfile.TemporaryDirectory() as tmp_dir: + done_file = os.path.join(tmp_dir, DONE_FILENAME) + output_object_store.download_object( + os.path.join(output_folder_prefix, DONE_FILENAME), + done_file) + with open(done_file) as df: + done_file_contents = df.read().splitlines() + except FileNotFoundError: + return False + else: + # Read the local done file + done_file = os.path.join(output_root, DONE_FILENAME) + if not os.path.isfile(done_file): + return False + with open(done_file) as df: + done_file_contents = df.read().splitlines() + # Compare the arguments + prev_args_str = done_file_contents[0] + if prev_args_str != args_str: + return False + + # Compare file names + prev_names = done_file_contents[1:] + if len(prev_names) != len(object_names): + return False + for idx, prev_name in enumerate(prev_names): + if object_names[idx] != prev_name: + return False + return True + + +def write_done_file(folder: str, args_str: str, object_names: List[str]): + """Write a file to signify completion. + + This the done file includes the arguments to processing and + a list of objects that were processed. 
+ + Args: + folder (str): Folder to write the done file to + args_str (str): String representation of arguments + object_names (List[str]): List of objects to convert to MDS format + """ + with open(os.path.join(folder, DONE_FILENAME), 'w') as done_file: + done_file.write('\n'.join([args_str] + object_names) + '\n') + + +def convert_text_to_mds( + tokenizer_name: str, + output_folder: str, + input_folder: str, + concat_tokens: int, + eos_text: str, + bos_text: str, + no_wrap: bool, + compression: str, + processes: int, + args_str: str, + reprocess: bool, +): + """Convert a folder of text files to MDS format. + + Args: + tokenizer_name (str): Name of tokenizer to use + output_folder (str): Folder to write MDS shards to + input_folder (str): Folder of text files to process + concat_tokens (int): Concantenate up to this many tokens + eos_text (str): Textend to append to each example to separate concatenated samples + bos_text (str): Text to prepend to each example to separate concatenated samples + no_wrap: (bool): Whether to let text examples wrap across multiple training examples + compression (str): The compression algorithm to use for MDS writing + processes (int): The number of processes to use. + args_str (str): String representation of the arguments + reprocess (bool): Whether to always reprocess the given folder of text files + """ + is_remote_output = is_remote_path(output_folder) + + object_names = get_object_names(input_folder) + if len(object_names) == 0: + raise ValueError(f'No text files were found at {input_folder}.') + + # Check if the text files in the bucket have already been processed. + if not reprocess and is_already_processed(output_folder, args_str, + object_names): + log.info( + f'Input folder {input_folder} is already processed at {output_folder} and ' + + + 'reprocess is set to False. Set reprocess to True if you would like to force reprocessing.' + ) + return + + # Use a temporary local directory if the output is remote and there are more than 1 processes + local_output_folder = tempfile.TemporaryDirectory( + ).name if is_remote_output else output_folder + + if processes > 1: + # Download and convert the text files in parallel + args = get_task_args(object_names, local_output_folder, input_folder, + processes, tokenizer_name, concat_tokens, eos_text, + bos_text, no_wrap, compression) + with ProcessPoolExecutor(max_workers=processes) as executor: + list(executor.map(download_and_convert_starargs, args)) + + # Merge the mds shards from each of the processes into a single folder + merge_shard_groups(local_output_folder) + else: + download_and_convert(object_names, local_output_folder, input_folder, + tokenizer_name, concat_tokens, eos_text, bos_text, + no_wrap, compression) + + # Write a done file with the args and object names + write_done_file(local_output_folder, args_str, object_names) + + if is_remote_output: + # Upload the local output to the remote location + output_object_store = cast( + ObjectStore, maybe_create_object_store_from_uri(output_folder)) + _, _, output_folder_prefix = parse_uri(output_folder) + files_to_upload = os.listdir(local_output_folder) + + for file in files_to_upload: + assert not os.path.isdir(file) + remote_path = os.path.join(output_folder_prefix, file) + output_object_store.upload_object( + remote_path, os.path.join(local_output_folder, file)) + + +def _args_str(original_args: Namespace) -> str: + """Create a string from the args to determine whether to reprocess. + + Args: + original_args (Namespace): Arguments to main function. 
+ """ + # Take the arguments that influence the final result. + # reprocess and max_mds_writer_workers are not taken. + args = Namespace( + tokenizer_name=original_args.tokenizer, + output_folder=original_args.output_folder, + input_folder=original_args.input_folder, + concat_tokens=original_args.concat_tokens, + eos_text=original_args.eos_text, + bos_text=original_args.bos_text, + no_wrap=original_args.no_wrap, + compression=original_args.compression, + processes=original_args.processes, + ) + + return str(args) + + +# COMMAND ---------- + +def is_hf_dataset_path(path): + """Check if a given string is a dataset path used by Hugging Face. + + Args: + path (str): The string to be checked. + + Returns: + bool: True if the string is a dataset path, False otherwise. + """ + # Regular expression to match the dataset path pattern + pattern = r"^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+(/[\w]+)?/?$" + + return bool(re.match(pattern, path)) + + +def create_om_cfg(FT_API_args): + task_type = FT_API_args.task_type + train_data_path = FT_API_args.train_data_path + model = FT_API_args.model + max_seq_len = FT_API_args.context_length + + common_args = { + 'drop_last': False, + 'num_workers': 0, + 'prefetch_factor': None, + 'pin_memory': False, + 'persistent_workers': False, + 'timeout': 0 + } + if task == 'INSTRUCTION_FINETUNE': + cfg = om.create({ + 'dataset': { + 'hf_name': train_data_path, + 'split': 'train', + 'max_seq_len': max_seq_len, + 'decoder_only_format': True, + 'allow_pad_trimming': False, + 'shuffle': True, + }, + **common_args + }) + + else: + cfg = om.create({ + 'name': 'finetuning', + 'dataset': { + 'remote': train_data_path, + 'local': train_data_path, + 'split': 'train', + 'max_seq_len': max_seq_len, + 'decoder_only_format': True, + 'allow_pad_trimming': False, + 'packing_ratio': None, + 'shuffle': True, + }, + **common_args + }) + + tokenizer = build_tokenizer( + tokenizer_name=model, + tokenizer_kwargs={'model_max_length': max_seq_len}, + ) + + return cfg, tokenizer + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## Validate and token Count + +# COMMAND ---------- + +if task_type == 'INSTRUCTION_FINETUNE': + # check if train_data_path is a valid HF dataset url with splits. + # load dataset.info and see if HF tokens are correctly set. + check_HF_datasets() + +elif task_type == 'CONTINUED_PRETRAIN': + # check if train_data_path is a valid object store that composer supports + + # Run convert_text_to_mds.py and dump MDS dataset to "save_folder" + args = parse_args(tokenizer, concat_tokens, output_folder, input_folder) + convert_text_to_mds(tokenizer_name=args.tokenizer, + output_folder=args.output_folder, + input_folder=args.input_folder, + concat_tokens=args.concat_tokens, + eos_text=args.eos_text, + bos_text=args.bos_text, + no_wrap=args.no_wrap, + compression=args.compression, + processes=args.processes, + reprocess=args.reprocess, + args_str=_args_str(args)) + +else: + raise ValueError(f"task_type can only be INSTRUCTION_FINETUNE or Continued_Pretraining but got {task_type} instead!") + # Run a few checks on resulted MDS datasets + # 1. no shards in output_folder + # 2. 
check shard completeness by downloading and inspecting index.json + +import torch +from omegaconf import OmegaConf as om +from llmfoundry.utils import build_tokenizer + +# build cfg from the inputs + +from llmfoundry.data.finetuning import build_finetuning_dataloader +tokenizer_name = 'EleutherAI/gpt-neox-20b' +tokenizer_kwargs = {'model_max_length': cfg.dataset.max_seq_len} +tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) + +device_batch_size = 1 +dataloader = build_finetuning_dataloader(cfg, tokenizer, + device_batch_size).dataloader + +total_tokens = 0 +for batch in dataloader: + if len(batch['input_ids']) == 0 (check labels as well if exist): + raise Error + + batch_tokens = batch['input_ids'] (add 'labels' as well if exist) + batch_token_count = sum(len(tokens) for tokens in batch_tokens) + total_tokens += batch_token_count + +print("Total number of tokens:", total_tokens) + +# COMMAND ---------- + + + +# COMMAND ---------- + + + +# COMMAND ---------- + + + +# COMMAND ---------- + + + +# COMMAND ---------- + + + +# COMMAND ---------- + +# MAGIC %md +# MAGIC # DEPRECATED BELOW + +# COMMAND ---------- + +# If running on databricks notebook, the url can only be a Volume path. +# Make sure this is compliant to https://github.com/mosaicml/llm-foundry/blob/1191267195367b5ec6093ed7854b8f6daf1be2d3/llmfoundry/data/text_data.py#L174-L178 + +# raw dataset location you will point FT API to. +# It can be a local path or a remote path (s3/gcs/oci/dbfs:Volume) +dataset_url = 'tatsu-lab/alpaca' # "s3://xxxxx" or "HF name" +preprocessing_fn = 'llmfoundry.data.finetuning.tasks:alpaca_preprocessing_function' + +# dataset schema with tokens +tokenized_table_schema = {'tokens': bytes, 'id': np.int64} +tokenizer = 'EleutherAI/gpt-neox-20b' +tokenizer_kwargs = {'model_max_length': 2048} + +output_folder ='/Volumes/main/mosaic_hackathon/managed-volume/output' +input_folder = '' +eos_text = '<|endoftext|>' + + +# COMMAND ---------- + +if not dataset_url: + raise ValueError("dataset_url needs to be set at this point!") + +# COMMAND ---------- + + + +# COMMAND ---------- + +def check_cloud_datasets(dataset_url, job): + + suffix = '.txt' if job==Job.CPT else '.jsonl' + + object_store = maybe_create_object_store_from_uri(dataset_url) + + if object_store is not None: + _, _, folder_prefix = parse_uri(dataset_url) + names = [ + name for name in object_store.list_objects(folder_prefix) + if name.endswith(suffix) + ] + else: + # input_folder is a local folder + names = [ + text_file for dirpath, _, _ in os.walk(input_folder) + for text_file in glob(os.path.join(dirpath, '*.' + suffix)) + ] + assert len(names) > 0, f"No {suffix} files found in {dataset_url}." 
+ return names + +check_cloud_datasets(dataset_url, job) + + +# COMMAND ---------- + +def validate_and_count_tokens(dataset_url, cfg, job): + if job == Job.IFT: + # for IFT, basic data processing to see (1) well-formed JSONL and (2) strip of empty tokens + import torch + from omegaconf import OmegaConf as om + from llmfoundry.utils import build_tokenizer + + tokenizer_name = 'EleutherAI/gpt-neox-20b' + tokenizer_kwargs = {'model_max_length': cfg.dataset.max_seq_len} + tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) + device_batch_size = 2 + dataloader = build_finetuning_dataloader(cfg, tokenizer, + device_batch_size).dataloader + + packing = cfg.dataset.get('packing_ratio') is not None + + for i, batch in enumerate(dataloader): + if i >= 5: + break + print(f'-----Batch {i}-----') + for k, v in batch.items(): + if isinstance(v, torch.Tensor): + print(k, v.shape) + else: + print(k, v) + else: # job == Job.CPT: + # for CPT, strip empty txt files + print("Make sure the script is running within llmfoundry") + convert_text_to_mds(tokenizer = tokenizer , + concat_tokens = tokenizer_kwargs['model_max_length'], + output_folder = output_folder, + input_folder = input_folder, + eos_text = '<|endoftext|>') + + dataset=StreamingDataset(local='/Volumes/datasets/default/byod/cpt_poc/output/') # output has the streaming shards + dataloader = DataLoader(dataset) + sample = next(iter(dataloader)) + b = np.asarray(sample['tokens']).tobytes() + token_ids = np.frombuffer(b, dtype=np.int64) + n_token_per_sample = len(token_ids) + print('total_tokens = ', n_token_per_sample * dataset.num_samples) + +validate_and_count_tokens() # print overall stats of dataset + +# COMMAND ---------- + + From c59c11fdd954ad49f3ff4c77a598f3a211ced32b Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Tue, 2 Jan 2024 22:09:30 -0800 Subject: [PATCH 02/63] update --- .../data_prep/validate_and_tokenize_data.py | 244 +++++------------- 1 file changed, 66 insertions(+), 178 deletions(-) diff --git a/scripts/data_prep/validate_and_tokenize_data.py b/scripts/data_prep/validate_and_tokenize_data.py index 68b1211f85..9dbb78de3b 100644 --- a/scripts/data_prep/validate_and_tokenize_data.py +++ b/scripts/data_prep/validate_and_tokenize_data.py @@ -39,22 +39,22 @@ # MAGIC - The inputs to this validation script is assumed to be the same or a subset of the FT API arguments, i.e., a configuration like below. Is this a valid assumption? 
# MAGIC ``` # MAGIC cfg = { -# MAGIC model: str, -# MAGIC train_data_path: str, -# MAGIC save_folder: str, -# MAGIC *, -# MAGIC task_type: Optional[str] = "INSTRUCTION_FINETUNE", -# MAGIC eval_data_path: Optional[str] = None, -# MAGIC eval_prompts: Optional[List[str]] = None, -# MAGIC custom_weights_path: Optional[str] = None, -# MAGIC training_duration: Optional[str] = None, -# MAGIC learning_rate: Optional[float] = None, -# MAGIC context_length: Optional[int] = None, -# MAGIC experiment_trackers: Optional[List[Dict]] = None, -# MAGIC data_prep_config: Optional[Dict] = None, -# MAGIC disable_credentials_check: Optional[bool] = None, -# MAGIC timeout: Optional[float] = 10, -# MAGIC future: Literal[False] = False, +# MAGIC model: str, +# MAGIC train_data_path: str, +# MAGIC save_folder: str, +# MAGIC *, +# MAGIC task_type: Optional[str] = "INSTRUCTION_FINETUNE", +# MAGIC eval_data_path: Optional[str] = None, +# MAGIC eval_prompts: Optional[List[str]] = None, +# MAGIC custom_weights_path: Optional[str] = None, +# MAGIC training_duration: Optional[str] = None, +# MAGIC learning_rate: Optional[float] = None, +# MAGIC context_length: Optional[int] = None, +# MAGIC experiment_trackers: Optional[List[Dict]] = None, +# MAGIC data_prep_config: Optional[Dict] = None, +# MAGIC disable_credentials_check: Optional[bool] = None, +# MAGIC timeout: Optional[float] = 10, +# MAGIC future: Literal[False] = False, # MAGIC } # MAGIC ``` @@ -76,20 +76,24 @@ from streaming import StreamingDataset import numpy as np from omegaconf import OmegaConf as om +from argparse import Namespace +from typing import Union, Tuple +from llmfoundry.utils import build_tokenizer +import torch # COMMAND ---------- FT_API_args = Namespace( model = 'EleutherAI/gpt-neox-20b', - train_data_path: str, - save_folder: str, - task_type: Optional[str] = "INSTRUCTION_FINETUNE", + train_data_path = 'tatsu-lab/alpaca', # 'mosaicml/dolly_hhrlhf/train', # tatsu-lab/alpaca/train', + save_folder = 'dbfs:/databricks/mlflow-tracking/EXPERIMENT_ID/RUN_ID/artifacts/checkpoints', + task_type = "INSTRUCTION_FINETUNE", eval_data_path = None, eval_prompts = None, custom_weights_path = None, training_duration = None, learning_rate = None, - context_length = None, + context_length = 2048, experiment_trackers = None, disable_credentials_check = None, # Extra argument to add to FT API @@ -562,6 +566,21 @@ def is_hf_dataset_path(path): return bool(re.match(pattern, path)) + + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## Validate and token Count + +# COMMAND ---------- + +os.environ['HF_ASSETS_CACHE'] = '/tmp/' +os.environ['HF_HOME'] = '/tmp/' +os.environ['HF_HUB_CACHE'] = '/tmp/' +os.environ['HF_DATASETS_CACHE'] = '/tmp/' + + def create_om_cfg(FT_API_args): task_type = FT_API_args.task_type train_data_path = FT_API_args.train_data_path @@ -570,13 +589,13 @@ def create_om_cfg(FT_API_args): common_args = { 'drop_last': False, - 'num_workers': 0, - 'prefetch_factor': None, + 'num_workers': 2, + 'prefetch_factor': 2, 'pin_memory': False, 'persistent_workers': False, 'timeout': 0 } - if task == 'INSTRUCTION_FINETUNE': + if task_type == 'INSTRUCTION_FINETUNE': cfg = om.create({ 'dataset': { 'hf_name': train_data_path, @@ -609,25 +628,30 @@ def create_om_cfg(FT_API_args): tokenizer_name=model, tokenizer_kwargs={'model_max_length': max_seq_len}, ) - + return cfg, tokenizer # COMMAND ---------- -# MAGIC %md -# MAGIC ## Validate and token Count - -# COMMAND ---------- - -if task_type == 'INSTRUCTION_FINETUNE': - # check if train_data_path is a valid HF dataset url 
with splits. - # load dataset.info and see if HF tokens are correctly set. - check_HF_datasets() - -elif task_type == 'CONTINUED_PRETRAIN': - # check if train_data_path is a valid object store that composer supports +# build cfg from the inputs - # Run convert_text_to_mds.py and dump MDS dataset to "save_folder" +if FT_API_args.task_type == 'INSTRUCTION_FINETUNE': + # check if train_data_path is a valid HF dataset url with splits. + # load dataset.info and see if HF tokens are correctly set. + # check_HF_datasets() + + cfg, tokenizer = create_om_cfg(FT_API_args) + +elif FT_API_args.task_type == 'CONTINUED_PRETRAIN': + # check if train_data_path is a valid object store that composer supports + cfg, tokenizer = create_om_cfg(FT_API_args) + + input_folder = FT_API_args.train_data_path + output_folder = FT_API_args.save_folder + concat_tokens = FT_API_args.context_length + tokenizer_name = FT_API_args.model + + # Run convert_text_to_mds.py and dump MDS dataset to "save_folder" args = parse_args(tokenizer, concat_tokens, output_folder, input_folder) convert_text_to_mds(tokenizer_name=args.tokenizer, output_folder=args.output_folder, @@ -640,18 +664,12 @@ def create_om_cfg(FT_API_args): processes=args.processes, reprocess=args.reprocess, args_str=_args_str(args)) - else: - raise ValueError(f"task_type can only be INSTRUCTION_FINETUNE or Continued_Pretraining but got {task_type} instead!") + raise ValueError(f"task_type can only be INSTRUCTION_FINETUNE or Continued_Pretraining but got {FT_API_args.task_type} instead!") # Run a few checks on resulted MDS datasets # 1. no shards in output_folder # 2. check shard completeness by downloading and inspecting index.json -import torch -from omegaconf import OmegaConf as om -from llmfoundry.utils import build_tokenizer - -# build cfg from the inputs from llmfoundry.data.finetuning import build_finetuning_dataloader tokenizer_name = 'EleutherAI/gpt-neox-20b' @@ -664,141 +682,11 @@ def create_om_cfg(FT_API_args): total_tokens = 0 for batch in dataloader: - if len(batch['input_ids']) == 0 (check labels as well if exist): - raise Error + if len(batch['input_ids']) == 0: # (check labels as well if exist): + raise ValueError('input_ids is empty') - batch_tokens = batch['input_ids'] (add 'labels' as well if exist) - batch_token_count = sum(len(tokens) for tokens in batch_tokens) + batch_tokens = batch['input_ids'] # (add 'labels' as well if exist) + batch_token_count = sum([len(tokens) for tokens in batch_tokens]) total_tokens += batch_token_count print("Total number of tokens:", total_tokens) - -# COMMAND ---------- - - - -# COMMAND ---------- - - - -# COMMAND ---------- - - - -# COMMAND ---------- - - - -# COMMAND ---------- - - - -# COMMAND ---------- - -# MAGIC %md -# MAGIC # DEPRECATED BELOW - -# COMMAND ---------- - -# If running on databricks notebook, the url can only be a Volume path. -# Make sure this is compliant to https://github.com/mosaicml/llm-foundry/blob/1191267195367b5ec6093ed7854b8f6daf1be2d3/llmfoundry/data/text_data.py#L174-L178 - -# raw dataset location you will point FT API to. 
-# It can be a local path or a remote path (s3/gcs/oci/dbfs:Volume) -dataset_url = 'tatsu-lab/alpaca' # "s3://xxxxx" or "HF name" -preprocessing_fn = 'llmfoundry.data.finetuning.tasks:alpaca_preprocessing_function' - -# dataset schema with tokens -tokenized_table_schema = {'tokens': bytes, 'id': np.int64} -tokenizer = 'EleutherAI/gpt-neox-20b' -tokenizer_kwargs = {'model_max_length': 2048} - -output_folder ='/Volumes/main/mosaic_hackathon/managed-volume/output' -input_folder = '' -eos_text = '<|endoftext|>' - - -# COMMAND ---------- - -if not dataset_url: - raise ValueError("dataset_url needs to be set at this point!") - -# COMMAND ---------- - - - -# COMMAND ---------- - -def check_cloud_datasets(dataset_url, job): - - suffix = '.txt' if job==Job.CPT else '.jsonl' - - object_store = maybe_create_object_store_from_uri(dataset_url) - - if object_store is not None: - _, _, folder_prefix = parse_uri(dataset_url) - names = [ - name for name in object_store.list_objects(folder_prefix) - if name.endswith(suffix) - ] - else: - # input_folder is a local folder - names = [ - text_file for dirpath, _, _ in os.walk(input_folder) - for text_file in glob(os.path.join(dirpath, '*.' + suffix)) - ] - assert len(names) > 0, f"No {suffix} files found in {dataset_url}." - return names - -check_cloud_datasets(dataset_url, job) - - -# COMMAND ---------- - -def validate_and_count_tokens(dataset_url, cfg, job): - if job == Job.IFT: - # for IFT, basic data processing to see (1) well-formed JSONL and (2) strip of empty tokens - import torch - from omegaconf import OmegaConf as om - from llmfoundry.utils import build_tokenizer - - tokenizer_name = 'EleutherAI/gpt-neox-20b' - tokenizer_kwargs = {'model_max_length': cfg.dataset.max_seq_len} - tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) - device_batch_size = 2 - dataloader = build_finetuning_dataloader(cfg, tokenizer, - device_batch_size).dataloader - - packing = cfg.dataset.get('packing_ratio') is not None - - for i, batch in enumerate(dataloader): - if i >= 5: - break - print(f'-----Batch {i}-----') - for k, v in batch.items(): - if isinstance(v, torch.Tensor): - print(k, v.shape) - else: - print(k, v) - else: # job == Job.CPT: - # for CPT, strip empty txt files - print("Make sure the script is running within llmfoundry") - convert_text_to_mds(tokenizer = tokenizer , - concat_tokens = tokenizer_kwargs['model_max_length'], - output_folder = output_folder, - input_folder = input_folder, - eos_text = '<|endoftext|>') - - dataset=StreamingDataset(local='/Volumes/datasets/default/byod/cpt_poc/output/') # output has the streaming shards - dataloader = DataLoader(dataset) - sample = next(iter(dataloader)) - b = np.asarray(sample['tokens']).tobytes() - token_ids = np.frombuffer(b, dtype=np.int64) - n_token_per_sample = len(token_ids) - print('total_tokens = ', n_token_per_sample * dataset.num_samples) - -validate_and_count_tokens() # print overall stats of dataset - -# COMMAND ---------- - - From 66f34ebd72b156451e240f5da3b76e9a9c444b30 Mon Sep 17 00:00:00 2001 From: xiaohanzhan-db Date: Wed, 3 Jan 2024 07:51:25 +0000 Subject: [PATCH 03/63] change token count function --- .../data_prep/validate_and_tokenize_data.py | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/scripts/data_prep/validate_and_tokenize_data.py b/scripts/data_prep/validate_and_tokenize_data.py index 9dbb78de3b..6e4364bee6 100644 --- a/scripts/data_prep/validate_and_tokenize_data.py +++ b/scripts/data_prep/validate_and_tokenize_data.py @@ -139,6 +139,10 
@@ def check_HF_datasets(dataset_names_with_splits): # COMMAND ---------- +from streaming.base.storage.upload import CloudUploader +from streaming.base.storage.download import download_file +import json + def integrity_check(out: Union[str, Tuple[str, str]]): """Check if the index file has integrity. @@ -565,9 +569,6 @@ def is_hf_dataset_path(path): return bool(re.match(pattern, path)) - - - # COMMAND ---------- # MAGIC %md @@ -677,16 +678,23 @@ def create_om_cfg(FT_API_args): tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) device_batch_size = 1 -dataloader = build_finetuning_dataloader(cfg, tokenizer, - device_batch_size).dataloader +dataspec = build_finetuning_dataloader(cfg, tokenizer, device_batch_size) +dataloader = dataspec.dataloader +token_counting_func = dataspec.get_num_tokens_in_batch total_tokens = 0 for batch in dataloader: - if len(batch['input_ids']) == 0: # (check labels as well if exist): - raise ValueError('input_ids is empty') + total_tokens += token_counting_func(batch) + + # if len(batch['input_ids']) == 0: # (check labels as well if exist): + # raise ValueError('input_ids is empty') - batch_tokens = batch['input_ids'] # (add 'labels' as well if exist) - batch_token_count = sum([len(tokens) for tokens in batch_tokens]) - total_tokens += batch_token_count + # batch_tokens = batch['input_ids'] # (add 'labels' as well if exist) + # batch_token_count = sum([len(tokens) for tokens in batch_tokens]) + # total_tokens += batch_token_count print("Total number of tokens:", total_tokens) + +# COMMAND ---------- + + From 2cd387b50108c061b4dca1ef8907b0fa77988236 Mon Sep 17 00:00:00 2001 From: xiaohanzhan-db Date: Fri, 5 Jan 2024 06:34:53 +0000 Subject: [PATCH 04/63] reorganize cells --- .../data_prep/validate_and_tokenize_data.py | 180 +++++++++--------- 1 file changed, 89 insertions(+), 91 deletions(-) diff --git a/scripts/data_prep/validate_and_tokenize_data.py b/scripts/data_prep/validate_and_tokenize_data.py index 6e4364bee6..b81811a5fb 100644 --- a/scripts/data_prep/validate_and_tokenize_data.py +++ b/scripts/data_prep/validate_and_tokenize_data.py @@ -83,6 +83,11 @@ # COMMAND ---------- +# MAGIC %md +# MAGIC ## User Defines the Cell Below + +# COMMAND ---------- + FT_API_args = Namespace( model = 'EleutherAI/gpt-neox-20b', train_data_path = 'tatsu-lab/alpaca', # 'mosaicml/dolly_hhrlhf/train', # tatsu-lab/alpaca/train', @@ -103,79 +108,15 @@ future = False, ) -# COMMAND ---------- - -# MAGIC %md -# MAGIC ## Utility Functions - -# COMMAND ---------- - -def check_HF_datasets(dataset_names_with_splits): - from huggingface_hub import dataset_info - from datasets import get_dataset_split_names - import os - token = os.environ.get("HUGGING_FACE_HUB_TOKEN") - for dataset_name_with_split in dataset_names_with_splits: - dataset_name, split = os.path.split(dataset_name_with_split) - # make sure we have a dataset and split - if not dataset_name or not split: - return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Please ensure that you include the split name (e.g. 'mosaicml/dolly_hhrlhf/train')." - # check user access to the dataset - try: - info = dataset_info(dataset_name) - except: - token_warning = "" - if not token: - token_warning = " If this is a private dataset, please set your HUGGING_FACE_HUB_TOKEN using: mcli create secret hf." - return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Please ensure that the dataset exists and that you have access to it. Remember to include the split name (e.g. 
'mosaicml/dolly_hhrlhf/train')." + token_warning - # check that split exists - try: - splits = get_dataset_split_names(dataset_name) - except: # error raised in the case of multiple subsets - return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Please make sure that the split is valid and that your dataset does not have subsets." - if split not in splits: - return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Split not found." - return True, "" +os.environ['HF_ASSETS_CACHE'] = '/tmp/' +os.environ['HF_HOME'] = '/tmp/' +os.environ['HF_HUB_CACHE'] = '/tmp/' +os.environ['HF_DATASETS_CACHE'] = '/tmp/' # COMMAND ---------- -from streaming.base.storage.upload import CloudUploader -from streaming.base.storage.download import download_file -import json - -def integrity_check(out: Union[str, Tuple[str, str]]): - """Check if the index file has integrity. - - If index is a cloud url, first download it to a temp local file. - - Args: - out (Union[str, Tuple[str,str]]): MDS dataset path - """ - - def get_expected(mds_root: str): - n_shard_files = 0 - cu = CloudUploader.get(mds_root, exist_ok=True, keep_local=True) - for o in cu.list_objects(): - if o.endswith('.mds'): - n_shard_files += 1 - return n_shard_files - - cu = CloudUploader.get(out, keep_local=True, exist_ok=True) - - with tempfile.TemporaryDirectory() as temp_dir: - if cu.remote: - download_file(os.path.join(cu.remote, 'index.json'), - os.path.join(temp_dir, 'index.json'), - timeout=60) - expected_n_shard_files = get_expected(cu.remote) - local_merged_index_path = os.path.join(temp_dir, 'index.json') - else: - local_merged_index_path = os.path.join(cu.local, 'index.json') - expected_n_shard_files = get_expected(cu.local) - - merged_index = json.load(open(local_merged_index_path, 'r')) - n_shard_files = len({b['raw_data']['basename'] for b in merged_index['shards']}) - assert n_shard_files == expected_n_shard_files, f'expected {expected_n_shard_files} shard files but got {n_shard_files}' +# MAGIC %md +# MAGIC ## Adapted from llmfoundry/scripts/data_prep/convert_text_to_mds.py # COMMAND ---------- @@ -555,6 +496,76 @@ def _args_str(original_args: Namespace) -> str: # COMMAND ---------- +# MAGIC %md +# MAGIC ## Validate Inputs and Count tokens + +# COMMAND ---------- + +from streaming.base.storage.upload import CloudUploader +from streaming.base.storage.download import download_file +import json + +def integrity_check(out: Union[str, Tuple[str, str]]): + """Check if the index file has integrity. + + If index is a cloud url, first download it to a temp local file. 
+ + Args: + out (Union[str, Tuple[str,str]]): MDS dataset path + """ + + def count_shards(mds_root: str): + n_shard_files = 0 + cu = CloudUploader.get(mds_root, exist_ok=True, keep_local=True) + for o in cu.list_objects(): + if o.endswith('.mds'): + n_shard_files += 1 + return n_shard_files + + cu = CloudUploader.get(out, keep_local=True, exist_ok=True) + + with tempfile.TemporaryDirectory() as temp_dir: + if cu.remote: + download_file(os.path.join(cu.remote, 'index.json'), + os.path.join(temp_dir, 'index.json'), + timeout=60) + actual_n_shard_files = count_shards(cu.remote) + local_merged_index_path = os.path.join(temp_dir, 'index.json') + else: + local_merged_index_path = os.path.join(cu.local, 'index.json') + actual_n_shard_files = count_shards(cu.local) + + merged_index = json.load(open(local_merged_index_path, 'r')) + n_shard_files = len({b['raw_data']['basename'] for b in merged_index['shards']}) + return n_shard_files == actual_n_shard_files + +def check_HF_datasets(dataset_names_with_splits): + from huggingface_hub import dataset_info + from datasets import get_dataset_split_names + import os + token = os.environ.get("HUGGING_FACE_HUB_TOKEN") + for dataset_name_with_split in dataset_names_with_splits: + dataset_name, split = os.path.split(dataset_name_with_split) + # make sure we have a dataset and split + if not dataset_name or not split: + return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Please ensure that you include the split name (e.g. 'mosaicml/dolly_hhrlhf/train')." + # check user access to the dataset + try: + info = dataset_info(dataset_name) + except: + token_warning = "" + if not token: + token_warning = " If this is a private dataset, please set your HUGGING_FACE_HUB_TOKEN using: mcli create secret hf." + return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Please ensure that the dataset exists and that you have access to it. Remember to include the split name (e.g. 'mosaicml/dolly_hhrlhf/train')." + token_warning + # check that split exists + try: + splits = get_dataset_split_names(dataset_name) + except: # error raised in the case of multiple subsets + return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Please make sure that the split is valid and that your dataset does not have subsets." + if split not in splits: + return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Split not found." + return True, "" + def is_hf_dataset_path(path): """Check if a given string is a dataset path used by Hugging Face. @@ -569,19 +580,6 @@ def is_hf_dataset_path(path): return bool(re.match(pattern, path)) -# COMMAND ---------- - -# MAGIC %md -# MAGIC ## Validate and token Count - -# COMMAND ---------- - -os.environ['HF_ASSETS_CACHE'] = '/tmp/' -os.environ['HF_HOME'] = '/tmp/' -os.environ['HF_HUB_CACHE'] = '/tmp/' -os.environ['HF_DATASETS_CACHE'] = '/tmp/' - - def create_om_cfg(FT_API_args): task_type = FT_API_args.task_type train_data_path = FT_API_args.train_data_path @@ -638,8 +636,10 @@ def create_om_cfg(FT_API_args): if FT_API_args.task_type == 'INSTRUCTION_FINETUNE': # check if train_data_path is a valid HF dataset url with splits. + if not is_hf_dataset_path(FT_API_args.train_data_path): + raise ValueError(f"Input path {FT_API_args.train_data_path} is not supported. It needs to be a valid Huggingface dataset path.") # load dataset.info and see if HF tokens are correctly set. 
- # check_HF_datasets() + check_HF_datasets(FT_API_args.train_data_path) cfg, tokenizer = create_om_cfg(FT_API_args) @@ -665,6 +665,11 @@ def create_om_cfg(FT_API_args): processes=args.processes, reprocess=args.reprocess, args_str=_args_str(args)) + + # Check if the MDS dataset is integral by checking index.json + if integrity_check(args.output_folder): + raise RuntimeError(f"{args.output_folder} has mismatched number of shard files between merged index.json and actual shards!") + else: raise ValueError(f"task_type can only be INSTRUCTION_FINETUNE or Continued_Pretraining but got {FT_API_args.task_type} instead!") # Run a few checks on resulted MDS datasets @@ -686,13 +691,6 @@ def create_om_cfg(FT_API_args): for batch in dataloader: total_tokens += token_counting_func(batch) - # if len(batch['input_ids']) == 0: # (check labels as well if exist): - # raise ValueError('input_ids is empty') - - # batch_tokens = batch['input_ids'] # (add 'labels' as well if exist) - # batch_token_count = sum([len(tokens) for tokens in batch_tokens]) - # total_tokens += batch_token_count - print("Total number of tokens:", total_tokens) # COMMAND ---------- From 3eac3bf1412a89983588e66916041bfb2b059dda Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Fri, 5 Jan 2024 00:42:12 -0800 Subject: [PATCH 05/63] Add unit tests --- .../data_prep/validate_and_tokenize_data.py | 140 +++++++++--------- .../test_validate_and_tokenize_data.py | 103 +++++++++++++ 2 files changed, 173 insertions(+), 70 deletions(-) create mode 100644 tests/a_scripts/data_prep/test_validate_and_tokenize_data.py diff --git a/scripts/data_prep/validate_and_tokenize_data.py b/scripts/data_prep/validate_and_tokenize_data.py index b81811a5fb..ae2d1129b3 100644 --- a/scripts/data_prep/validate_and_tokenize_data.py +++ b/scripts/data_prep/validate_and_tokenize_data.py @@ -64,7 +64,7 @@ # COMMAND ---------- -dbutils.library.restartPython() +# dbutils.library.restartPython() # COMMAND ---------- @@ -77,13 +77,14 @@ import numpy as np from omegaconf import OmegaConf as om from argparse import Namespace -from typing import Union, Tuple +from typing import Union, Tuple from llmfoundry.utils import build_tokenizer -import torch +from huggingface_hub import dataset_info +from datasets import get_dataset_split_names # COMMAND ---------- -# MAGIC %md +# MAGIC %md # MAGIC ## User Defines the Cell Below # COMMAND ---------- @@ -503,7 +504,7 @@ def _args_str(original_args: Namespace) -> str: from streaming.base.storage.upload import CloudUploader from streaming.base.storage.download import download_file -import json +import json def integrity_check(out: Union[str, Tuple[str, str]]): """Check if the index file has integrity. @@ -538,11 +539,8 @@ def count_shards(mds_root: str): merged_index = json.load(open(local_merged_index_path, 'r')) n_shard_files = len({b['raw_data']['basename'] for b in merged_index['shards']}) return n_shard_files == actual_n_shard_files - + def check_HF_datasets(dataset_names_with_splits): - from huggingface_hub import dataset_info - from datasets import get_dataset_split_names - import os token = os.environ.get("HUGGING_FACE_HUB_TOKEN") for dataset_name_with_split in dataset_names_with_splits: dataset_name, split = os.path.split(dataset_name_with_split) @@ -565,7 +563,7 @@ def check_HF_datasets(dataset_names_with_splits): if split not in splits: return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Split not found." 
return True, "" - + def is_hf_dataset_path(path): """Check if a given string is a dataset path used by Hugging Face. @@ -576,7 +574,7 @@ def is_hf_dataset_path(path): bool: True if the string is a dataset path, False otherwise. """ # Regular expression to match the dataset path pattern - pattern = r"^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+(/[\w]+)?/?$" + pattern = r"^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+/?(train|validation|test)?/?$" return bool(re.match(pattern, path)) @@ -627,72 +625,74 @@ def create_om_cfg(FT_API_args): tokenizer_name=model, tokenizer_kwargs={'model_max_length': max_seq_len}, ) - + return cfg, tokenizer # COMMAND ---------- # build cfg from the inputs +def main(): + if FT_API_args.task_type == 'INSTRUCTION_FINETUNE': + # check if train_data_path is a valid HF dataset url with splits. + if not is_hf_dataset_path(FT_API_args.train_data_path): + raise ValueError(f"Input path {FT_API_args.train_data_path} is not supported. It needs to be a valid Huggingface dataset path.") + # load dataset.info and see if HF tokens are correctly set. + check_HF_datasets(FT_API_args.train_data_path) + + cfg, tokenizer = create_om_cfg(FT_API_args) + + elif FT_API_args.task_type == 'CONTINUED_PRETRAIN': + # check if train_data_path is a valid object store that composer supports + cfg, tokenizer = create_om_cfg(FT_API_args) + + input_folder = FT_API_args.train_data_path + output_folder = FT_API_args.save_folder + concat_tokens = FT_API_args.context_length + tokenizer_name = FT_API_args.model + + # Run convert_text_to_mds.py and dump MDS dataset to "save_folder" + args = parse_args(tokenizer, concat_tokens, output_folder, input_folder) + convert_text_to_mds(tokenizer_name=args.tokenizer, + output_folder=args.output_folder, + input_folder=args.input_folder, + concat_tokens=args.concat_tokens, + eos_text=args.eos_text, + bos_text=args.bos_text, + no_wrap=args.no_wrap, + compression=args.compression, + processes=args.processes, + reprocess=args.reprocess, + args_str=_args_str(args)) + + # Check if the MDS dataset is integral by checking index.json + if integrity_check(args.output_folder): + raise RuntimeError(f"{args.output_folder} has mismatched number of shard files between merged index.json and actual shards!") + + else: + raise ValueError(f"task_type can only be INSTRUCTION_FINETUNE or Continued_Pretraining but got {FT_API_args.task_type} instead!") + # Run a few checks on resulted MDS datasets + # 1. no shards in output_folder + # 2. check shard completeness by downloading and inspecting index.json + + + from llmfoundry.data.finetuning import build_finetuning_dataloader + tokenizer_name = 'EleutherAI/gpt-neox-20b' + tokenizer_kwargs = {'model_max_length': cfg.dataset.max_seq_len} + tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) + + device_batch_size = 1 + dataspec = build_finetuning_dataloader(cfg, tokenizer, device_batch_size) + dataloader = dataspec.dataloader + token_counting_func = dataspec.get_num_tokens_in_batch + + total_tokens = 0 + for batch in dataloader: + total_tokens += token_counting_func(batch) -if FT_API_args.task_type == 'INSTRUCTION_FINETUNE': - # check if train_data_path is a valid HF dataset url with splits. - if not is_hf_dataset_path(FT_API_args.train_data_path): - raise ValueError(f"Input path {FT_API_args.train_data_path} is not supported. It needs to be a valid Huggingface dataset path.") - # load dataset.info and see if HF tokens are correctly set. 
- check_HF_datasets(FT_API_args.train_data_path) - - cfg, tokenizer = create_om_cfg(FT_API_args) - -elif FT_API_args.task_type == 'CONTINUED_PRETRAIN': - # check if train_data_path is a valid object store that composer supports - cfg, tokenizer = create_om_cfg(FT_API_args) - - input_folder = FT_API_args.train_data_path - output_folder = FT_API_args.save_folder - concat_tokens = FT_API_args.context_length - tokenizer_name = FT_API_args.model - - # Run convert_text_to_mds.py and dump MDS dataset to "save_folder" - args = parse_args(tokenizer, concat_tokens, output_folder, input_folder) - convert_text_to_mds(tokenizer_name=args.tokenizer, - output_folder=args.output_folder, - input_folder=args.input_folder, - concat_tokens=args.concat_tokens, - eos_text=args.eos_text, - bos_text=args.bos_text, - no_wrap=args.no_wrap, - compression=args.compression, - processes=args.processes, - reprocess=args.reprocess, - args_str=_args_str(args)) - - # Check if the MDS dataset is integral by checking index.json - if integrity_check(args.output_folder): - raise RuntimeError(f"{args.output_folder} has mismatched number of shard files between merged index.json and actual shards!") - -else: - raise ValueError(f"task_type can only be INSTRUCTION_FINETUNE or Continued_Pretraining but got {FT_API_args.task_type} instead!") - # Run a few checks on resulted MDS datasets - # 1. no shards in output_folder - # 2. check shard completeness by downloading and inspecting index.json - - -from llmfoundry.data.finetuning import build_finetuning_dataloader -tokenizer_name = 'EleutherAI/gpt-neox-20b' -tokenizer_kwargs = {'model_max_length': cfg.dataset.max_seq_len} -tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) - -device_batch_size = 1 -dataspec = build_finetuning_dataloader(cfg, tokenizer, device_batch_size) -dataloader = dataspec.dataloader -token_counting_func = dataspec.get_num_tokens_in_batch - -total_tokens = 0 -for batch in dataloader: - total_tokens += token_counting_func(batch) - -print("Total number of tokens:", total_tokens) + print("Total number of tokens:", total_tokens) # COMMAND ---------- +if __name__ == '__main__': + main() diff --git a/tests/a_scripts/data_prep/test_validate_and_tokenize_data.py b/tests/a_scripts/data_prep/test_validate_and_tokenize_data.py new file mode 100644 index 0000000000..5b3b5b561b --- /dev/null +++ b/tests/a_scripts/data_prep/test_validate_and_tokenize_data.py @@ -0,0 +1,103 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 +from unittest.mock import Mock, patch, MagicMock, mock_open +from argparse import Namespace +from scripts.data_prep.validate_and_tokenize_data import integrity_check, check_HF_datasets, is_hf_dataset_path, create_om_cfg +from streaming.base.storage.upload import CloudUploader +from transformers import AutoTokenizer + +class MockCloudUploader: + def __init__(self): + self.remote = "some_remote_path" + self.local = "some_local_path" + + def list_objects(self): + return ['shard1.mds', 'shard2.mds'] + +class MockDatasetInfo: + def __init__(self): + self.id = "valid_dataset" + self.description = "A mock dataset description" + +@patch('scripts.data_prep.validate_and_tokenize_data.CloudUploader.get') +@patch('scripts.data_prep.validate_and_tokenize_data.download_file') +@patch('scripts.data_prep.validate_and_tokenize_data.json.load') +@patch('builtins.open', new_callable=mock_open, read_data='{"shards": [{"raw_data": {"basename": "shard1.mds"}}, {"raw_data": {"basename": "shard2.mds"}}]}') +def 
test_integrity_check(mock_file_open, mock_json_load, mock_download_file, mock_cloud_uploader): + # Setup mocks + mock_cloud_uploader.return_value = MockCloudUploader() + mock_json_load.return_value = {'shards': [{'raw_data': {'basename': 'shard1.mds'}}, {'raw_data': {'basename': 'shard2.mds'}}]} + + # Test case where integrity is valid + assert integrity_check('mock_dataset_path') + + # Test case where integrity is invalid + # Modify the mock to simulate a different scenario + mock_json_load.return_value = {'shards': [{'raw_data': {'basename': 'shard1.mds'}}]} # less shards than expected + assert not integrity_check('mock_dataset_path') + +# Additional tests can be written for cases like remote URL, file not found, etc. + + + +@patch('scripts.data_prep.validate_and_tokenize_data.dataset_info') +@patch('scripts.data_prep.validate_and_tokenize_data.get_dataset_split_names') +def test_check_HF_datasets(mock_get_splits, mock_dataset_info): + # Setup mocks + mock_get_splits.return_value = ['train', 'test'] + mock_dataset_info.return_value = MockDatasetInfo() + + # Test valid dataset with valid split + result, message = check_HF_datasets(['valid_dataset/train']) + assert result + + # Test valid dataset with invalid split + result, message = check_HF_datasets(['valid_dataset/invalid_split']) + assert not result + + # Test invalid dataset + mock_dataset_info.side_effect = Exception("Dataset not found") + result, message = check_HF_datasets(['invalid_dataset/train']) + assert not result + +# Additional tests for private datasets, token issues, etc. + + + +def test_is_hf_dataset_path(): + # Valid dataset paths + assert is_hf_dataset_path('user/dataset/train') + assert is_hf_dataset_path('user/dataset') + + # Invalid dataset paths + assert not is_hf_dataset_path('user@dataset/train') + assert not is_hf_dataset_path('just_dataset_name') + assert not is_hf_dataset_path('user/dataset/unknown_split/') + + +@patch('transformers.AutoTokenizer.from_pretrained') +def test_create_om_cfg_instruction_finetune(mock_from_pretrained): + mock_from_pretrained.return_value = MagicMock(spec=AutoTokenizer) + args = Namespace( + task_type='INSTRUCTION_FINETUNE', + train_data_path='hf_dataset/train', + model='model_name', + context_length=512 + ) + cfg, tokenizer = create_om_cfg(args) + assert cfg.dataset.hf_name == 'hf_dataset/train' + assert cfg.dataset.max_seq_len == 512 + +@patch('transformers.AutoTokenizer.from_pretrained') +def test_create_om_cfg_continued_pretrain(mock_from_pretrained): + mock_from_pretrained.return_value = MagicMock(spec=AutoTokenizer) + args = Namespace( + task_type='CONTINUED_PRETRAIN', + train_data_path='object_store_path', + model='model_name', + context_length=512 + ) + cfg, tokenizer = create_om_cfg(args) + assert cfg.dataset.remote == 'object_store_path' + assert cfg.dataset.max_seq_len == 512 + From d2d976775a3fa5a952c6432a1e53844508061ecd Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Sat, 6 Jan 2024 14:03:11 -0800 Subject: [PATCH 06/63] Add a printout for CPT --- scripts/data_prep/validate_and_tokenize_data.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/data_prep/validate_and_tokenize_data.py b/scripts/data_prep/validate_and_tokenize_data.py index ae2d1129b3..d647060324 100644 --- a/scripts/data_prep/validate_and_tokenize_data.py +++ b/scripts/data_prep/validate_and_tokenize_data.py @@ -668,6 +668,8 @@ def main(): if integrity_check(args.output_folder): raise RuntimeError(f"{args.output_folder} has mismatched number of shard files between merged index.json and actual 
shards!") + print("Converted data for continnued pre-training was saved in: ", args.output_folder) + else: raise ValueError(f"task_type can only be INSTRUCTION_FINETUNE or Continued_Pretraining but got {FT_API_args.task_type} instead!") # Run a few checks on resulted MDS datasets From be25591774e87d336ce74049862ced6a831047b0 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Sat, 6 Jan 2024 14:20:00 -0800 Subject: [PATCH 07/63] update question --- scripts/data_prep/validate_and_tokenize_data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/data_prep/validate_and_tokenize_data.py b/scripts/data_prep/validate_and_tokenize_data.py index d647060324..30ba2e4456 100644 --- a/scripts/data_prep/validate_and_tokenize_data.py +++ b/scripts/data_prep/validate_and_tokenize_data.py @@ -56,6 +56,7 @@ # MAGIC timeout: Optional[float] = 10, # MAGIC future: Literal[False] = False, # MAGIC } +# MAGIC - What null checkings do we want to have? # MAGIC ``` # COMMAND ---------- From 4651be7de1aebc9fcfaefb1034a5c71920c9e708 Mon Sep 17 00:00:00 2001 From: xiaohanzhan-db Date: Mon, 8 Jan 2024 05:49:17 +0000 Subject: [PATCH 08/63] Add questions --- scripts/data_prep/validate_and_tokenize_data.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/data_prep/validate_and_tokenize_data.py b/scripts/data_prep/validate_and_tokenize_data.py index 30ba2e4456..dfa47b946b 100644 --- a/scripts/data_prep/validate_and_tokenize_data.py +++ b/scripts/data_prep/validate_and_tokenize_data.py @@ -57,6 +57,8 @@ # MAGIC future: Literal[False] = False, # MAGIC } # MAGIC - What null checkings do we want to have? +# MAGIC - How to map the model to its expected eos_text / bos_text format? [Ref](https://databricks.slack.com/archives/C05K29T9NBF/p1703644153357929?thread_ts=1703643155.904289&cid=C05K29T9NBF) +# MAGIC - How to automate tokenization for CPT? it is always really standard: sequence -> concat(tok(BOS), tok(sequence), tok(EOS)), and then concatenate sequences. 
[Ref](https://databricks.slack.com/archives/C05K29T9NBF/p1703698056000399?thread_ts=1703643155.904289&cid=C05K29T9NBF) # MAGIC ``` # COMMAND ---------- From 5cd6a94dc13dde2e34cab6ece86bd1500f834597 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Sun, 7 Jan 2024 23:48:20 -0800 Subject: [PATCH 09/63] Fix lints --- .../data_prep/validate_and_tokenize_data.py | 169 ++++++++++-------- .../test_validate_and_tokenize_data.py | 98 ++++++---- 2 files changed, 160 insertions(+), 107 deletions(-) diff --git a/scripts/data_prep/validate_and_tokenize_data.py b/scripts/data_prep/validate_and_tokenize_data.py index dfa47b946b..de369af59d 100644 --- a/scripts/data_prep/validate_and_tokenize_data.py +++ b/scripts/data_prep/validate_and_tokenize_data.py @@ -1,3 +1,6 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + # Databricks notebook source # MAGIC %md # MAGIC JIRA: https://databricks.atlassian.net/jira/software/c/projects/STR/issues/STR-141?filter=allissues @@ -73,17 +76,16 @@ import os import re -from enum import Enum -from composer.utils import (ObjectStore, maybe_create_object_store_from_uri, parse_uri) -from torch.utils.data import DataLoader -from streaming import StreamingDataset -import numpy as np +from argparse import ArgumentParser, Namespace +from typing import Tuple, Union + +from composer.utils import (ObjectStore, maybe_create_object_store_from_uri, + parse_uri) +from datasets import get_dataset_split_names +from huggingface_hub import dataset_info from omegaconf import OmegaConf as om -from argparse import Namespace -from typing import Union, Tuple + from llmfoundry.utils import build_tokenizer -from huggingface_hub import dataset_info -from datasets import get_dataset_split_names # COMMAND ---------- @@ -93,23 +95,28 @@ # COMMAND ---------- FT_API_args = Namespace( - model = 'EleutherAI/gpt-neox-20b', - train_data_path = 'tatsu-lab/alpaca', # 'mosaicml/dolly_hhrlhf/train', # tatsu-lab/alpaca/train', - save_folder = 'dbfs:/databricks/mlflow-tracking/EXPERIMENT_ID/RUN_ID/artifacts/checkpoints', - task_type = "INSTRUCTION_FINETUNE", - eval_data_path = None, - eval_prompts = None, - custom_weights_path = None, - training_duration = None, - learning_rate = None, - context_length = 2048, - experiment_trackers = None, - disable_credentials_check = None, + model='EleutherAI/gpt-neox-20b', + train_data_path= + 'tatsu-lab/alpaca', # 'mosaicml/dolly_hhrlhf/train', # tatsu-lab/alpaca/train', + save_folder= + 'dbfs:/databricks/mlflow-tracking/EXPERIMENT_ID/RUN_ID/artifacts/checkpoints', + task_type='INSTRUCTION_FINETUNE', + eval_data_path=None, + eval_prompts=None, + custom_weights_path=None, + training_duration=None, + learning_rate=None, + context_length=2048, + experiment_trackers=None, + disable_credentials_check=None, # Extra argument to add to FT API # See comment https://databricks.atlassian.net/browse/STR-141?focusedCommentId=4308948 - data_prep_config = {'data_validation': True, 'data_prep': False}, - timeout = 10, - future = False, + data_prep_config={ + 'data_validation': True, + 'data_prep': False + }, + timeout=10, + future=False, ) os.environ['HF_ASSETS_CACHE'] = '/tmp/' @@ -131,14 +138,12 @@ import logging import math -import os import tempfile -from argparse import ArgumentParser, Namespace +from argparse import Namespace from concurrent.futures import ProcessPoolExecutor from glob import glob from typing import Iterable, List, Tuple, cast -import psutil from composer.utils import (ObjectStore, maybe_create_object_store_from_uri, parse_uri) 
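The "standard" CPT tokenization recipe raised in the questions above (BOS + sequence + EOS, then concatenation into fixed-length windows) can be sketched as follows. This is a simplified illustration with integer token ids assumed; it is not the ConcatTokensDataset implementation used elsewhere in this script.

```
from typing import List

def concat_and_chunk(token_streams: List[List[int]], bos: int, eos: int,
                     max_length: int) -> List[List[int]]:
    """Concatenate tokenized sequences with BOS/EOS, then cut fixed windows."""
    flat: List[int] = []
    for tokens in token_streams:
        flat.extend([bos] + tokens + [eos])
    # Leftover tokens shorter than one window are simply dropped in this sketch.
    return [flat[i:i + max_length]
            for i in range(0, len(flat) - max_length + 1, max_length)]

# Toy example: two "documents", bos=1, eos=2, window of 4 tokens.
print(concat_and_chunk([[5, 6, 7], [8, 9]], bos=1, eos=2, max_length=4))
# -> [[1, 5, 6, 7], [2, 1, 8, 9]]
```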
from streaming import MDSWriter @@ -153,27 +158,33 @@ DONE_FILENAME = '.text_to_mds_conversion_done' -def parse_args( tokenizer, - concat_tokens, - output_folder, - input_folder, - compression = 'zstd', - bos_text = '', - eos_text = '', - no_wrap = False , - processes = 32, # min(max(psutil.cpu_count() - 2, 1), 32), - reprocess = False ) -> Namespace: - - parsed = Namespace(tokenizer = tokenizer, - concat_tokens = model_max_length, - output_folder = output_folder, - input_folder = input_folder, - eos_text = eos_text, - bos_text = bos_text, - no_wrap = no_wrap, - compression = compression, - processes = processes, - reprocess = reprocess) +def parse_args( + tokenizer: str, + concat_tokens: int, + output_folder: str, + input_folder: str, + compression: str = 'zstd', + bos_text: str = '', + eos_text: str = '', + no_wrap: bool = False, + processes: int = 32, # min(max(psutil.cpu_count() - 2, 1), 32), + reprocess: bool = False +) -> Namespace: + + parser = ArgumentParser( + description= + 'Convert text files into MDS format, optionally concatenating and tokenizing', + ) + parsed = Namespace(tokenizer=tokenizer, + concat_tokens=concat_tokens, + output_folder=output_folder, + input_folder=input_folder, + eos_text=eos_text, + bos_text=bos_text, + no_wrap=no_wrap, + compression=compression, + processes=processes, + reprocess=reprocess) # Make sure we have needed concat options if (parsed.concat_tokens is not None and @@ -505,10 +516,12 @@ def _args_str(original_args: Namespace) -> str: # COMMAND ---------- -from streaming.base.storage.upload import CloudUploader -from streaming.base.storage.download import download_file import json +from streaming.base.storage.download import download_file +from streaming.base.storage.upload import CloudUploader + + def integrity_check(out: Union[str, Tuple[str, str]]): """Check if the index file has integrity. @@ -540,11 +553,13 @@ def count_shards(mds_root: str): actual_n_shard_files = count_shards(cu.local) merged_index = json.load(open(local_merged_index_path, 'r')) - n_shard_files = len({b['raw_data']['basename'] for b in merged_index['shards']}) + n_shard_files = len( + {b['raw_data']['basename'] for b in merged_index['shards']}) return n_shard_files == actual_n_shard_files -def check_HF_datasets(dataset_names_with_splits): - token = os.environ.get("HUGGING_FACE_HUB_TOKEN") + +def check_HF_datasets(dataset_names_with_splits: list): + token = os.environ.get('HUGGING_FACE_HUB_TOKEN') for dataset_name_with_split in dataset_names_with_splits: dataset_name, split = os.path.split(dataset_name_with_split) # make sure we have a dataset and split @@ -552,22 +567,23 @@ def check_HF_datasets(dataset_names_with_splits): return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Please ensure that you include the split name (e.g. 'mosaicml/dolly_hhrlhf/train')." # check user access to the dataset try: - info = dataset_info(dataset_name) + _ = dataset_info(dataset_name) except: - token_warning = "" + token_warning = '' if not token: - token_warning = " If this is a private dataset, please set your HUGGING_FACE_HUB_TOKEN using: mcli create secret hf." + token_warning = ' If this is a private dataset, please set your HUGGING_FACE_HUB_TOKEN using: mcli create secret hf.' return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Please ensure that the dataset exists and that you have access to it. Remember to include the split name (e.g. 'mosaicml/dolly_hhrlhf/train')." 
+ token_warning # check that split exists try: splits = get_dataset_split_names(dataset_name) except: # error raised in the case of multiple subsets - return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Please make sure that the split is valid and that your dataset does not have subsets." + return False, f'Failed to load Hugging Face dataset {dataset_name_with_split}. Please make sure that the split is valid and that your dataset does not have subsets.' if split not in splits: - return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Split not found." - return True, "" + return False, f'Failed to load Hugging Face dataset {dataset_name_with_split}. Split not found.' + return True, '' -def is_hf_dataset_path(path): + +def is_hf_dataset_path(path: str): """Check if a given string is a dataset path used by Hugging Face. Args: @@ -577,11 +593,12 @@ def is_hf_dataset_path(path): bool: True if the string is a dataset path, False otherwise. """ # Regular expression to match the dataset path pattern - pattern = r"^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+/?(train|validation|test)?/?$" + pattern = r'^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+/?(train|validation|test)?/?$' return bool(re.match(pattern, path)) -def create_om_cfg(FT_API_args): + +def create_om_cfg(FT_API_args: Namespace): task_type = FT_API_args.task_type train_data_path = FT_API_args.train_data_path model = FT_API_args.model @@ -631,14 +648,18 @@ def create_om_cfg(FT_API_args): return cfg, tokenizer + # COMMAND ---------- + # build cfg from the inputs def main(): if FT_API_args.task_type == 'INSTRUCTION_FINETUNE': # check if train_data_path is a valid HF dataset url with splits. if not is_hf_dataset_path(FT_API_args.train_data_path): - raise ValueError(f"Input path {FT_API_args.train_data_path} is not supported. It needs to be a valid Huggingface dataset path.") + raise ValueError( + f'Input path {FT_API_args.train_data_path} is not supported. It needs to be a valid Huggingface dataset path.' + ) # load dataset.info and see if HF tokens are correctly set. check_HF_datasets(FT_API_args.train_data_path) @@ -669,16 +690,20 @@ def main(): # Check if the MDS dataset is integral by checking index.json if integrity_check(args.output_folder): - raise RuntimeError(f"{args.output_folder} has mismatched number of shard files between merged index.json and actual shards!") + raise RuntimeError( + f'{args.output_folder} has mismatched number of shard files between merged index.json and actual shards!' + ) - print("Converted data for continnued pre-training was saved in: ", args.output_folder) + print('Converted data for continnued pre-training was saved in: ', + args.output_folder) else: - raise ValueError(f"task_type can only be INSTRUCTION_FINETUNE or Continued_Pretraining but got {FT_API_args.task_type} instead!") - # Run a few checks on resulted MDS datasets - # 1. no shards in output_folder - # 2. check shard completeness by downloading and inspecting index.json - + raise ValueError( + f'task_type can only be INSTRUCTION_FINETUNE or Continued_Pretraining but got {FT_API_args.task_type} instead!' + ) + # Run a few checks on resulted MDS datasets + # 1. no shards in output_folder + # 2. 
check shard completeness by downloading and inspecting index.json from llmfoundry.data.finetuning import build_finetuning_dataloader tokenizer_name = 'EleutherAI/gpt-neox-20b' @@ -694,10 +719,10 @@ def main(): for batch in dataloader: total_tokens += token_counting_func(batch) - print("Total number of tokens:", total_tokens) + print('Total number of tokens:', total_tokens) -# COMMAND ---------- +# COMMAND ---------- if __name__ == '__main__': main() diff --git a/tests/a_scripts/data_prep/test_validate_and_tokenize_data.py b/tests/a_scripts/data_prep/test_validate_and_tokenize_data.py index 5b3b5b561b..8a78581fef 100644 --- a/tests/a_scripts/data_prep/test_validate_and_tokenize_data.py +++ b/tests/a_scripts/data_prep/test_validate_and_tokenize_data.py @@ -1,67 +1,99 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -from unittest.mock import Mock, patch, MagicMock, mock_open from argparse import Namespace -from scripts.data_prep.validate_and_tokenize_data import integrity_check, check_HF_datasets, is_hf_dataset_path, create_om_cfg -from streaming.base.storage.upload import CloudUploader +from typing import Any +from unittest.mock import MagicMock, mock_open, patch + from transformers import AutoTokenizer +from scripts.data_prep.validate_and_tokenize_data import (check_HF_datasets, + create_om_cfg, + integrity_check, + is_hf_dataset_path) + + class MockCloudUploader: + def __init__(self): - self.remote = "some_remote_path" - self.local = "some_local_path" + self.remote = 'some_remote_path' + self.local = 'some_local_path' def list_objects(self): return ['shard1.mds', 'shard2.mds'] + class MockDatasetInfo: + def __init__(self): - self.id = "valid_dataset" - self.description = "A mock dataset description" + self.id = 'valid_dataset' + self.description = 'A mock dataset description' + @patch('scripts.data_prep.validate_and_tokenize_data.CloudUploader.get') @patch('scripts.data_prep.validate_and_tokenize_data.download_file') @patch('scripts.data_prep.validate_and_tokenize_data.json.load') -@patch('builtins.open', new_callable=mock_open, read_data='{"shards": [{"raw_data": {"basename": "shard1.mds"}}, {"raw_data": {"basename": "shard2.mds"}}]}') -def test_integrity_check(mock_file_open, mock_json_load, mock_download_file, mock_cloud_uploader): +@patch( + 'builtins.open', + new_callable=mock_open, + read_data= + '{"shards": [{"raw_data": {"basename": "shard1.mds"}}, {"raw_data": {"basename": "shard2.mds"}}]}' +) +def test_integrity_check(mock_file_open: Any, mock_json_load: Any, + mock_download_file: Any, mock_cloud_uploader: Any): # Setup mocks mock_cloud_uploader.return_value = MockCloudUploader() - mock_json_load.return_value = {'shards': [{'raw_data': {'basename': 'shard1.mds'}}, {'raw_data': {'basename': 'shard2.mds'}}]} + mock_json_load.return_value = { + 'shards': [{ + 'raw_data': { + 'basename': 'shard1.mds' + } + }, { + 'raw_data': { + 'basename': 'shard2.mds' + } + }] + } # Test case where integrity is valid assert integrity_check('mock_dataset_path') # Test case where integrity is invalid # Modify the mock to simulate a different scenario - mock_json_load.return_value = {'shards': [{'raw_data': {'basename': 'shard1.mds'}}]} # less shards than expected + mock_json_load.return_value = { + 'shards': [{ + 'raw_data': { + 'basename': 'shard1.mds' + } + }] + } # less shards than expected assert not integrity_check('mock_dataset_path') -# Additional tests can be written for cases like remote URL, file not found, etc. 
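One of the additional cases mentioned above, sketched as a test. It assumes that a missing remote index.json simply propagates the underlying error, and it reuses the mocks and imports already defined in this test module.

```
import pytest

@patch('scripts.data_prep.validate_and_tokenize_data.CloudUploader.get')
@patch('scripts.data_prep.validate_and_tokenize_data.download_file')
def test_integrity_check_missing_index(mock_download_file: Any,
                                       mock_cloud_uploader: Any):
    # Remote path is set, so integrity_check tries to download index.json.
    mock_cloud_uploader.return_value = MockCloudUploader()
    mock_download_file.side_effect = FileNotFoundError('index.json not found')

    with pytest.raises(FileNotFoundError):
        integrity_check('mock_dataset_path')
```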
+# Additional tests can be written for cases like remote URL, file not found, etc. @patch('scripts.data_prep.validate_and_tokenize_data.dataset_info') @patch('scripts.data_prep.validate_and_tokenize_data.get_dataset_split_names') -def test_check_HF_datasets(mock_get_splits, mock_dataset_info): +def test_check_HF_datasets(mock_get_splits: Any, mock_dataset_info: Any): # Setup mocks mock_get_splits.return_value = ['train', 'test'] mock_dataset_info.return_value = MockDatasetInfo() # Test valid dataset with valid split - result, message = check_HF_datasets(['valid_dataset/train']) + result, _ = check_HF_datasets(['valid_dataset/train']) assert result # Test valid dataset with invalid split - result, message = check_HF_datasets(['valid_dataset/invalid_split']) + result, _ = check_HF_datasets(['valid_dataset/invalid_split']) assert not result # Test invalid dataset - mock_dataset_info.side_effect = Exception("Dataset not found") - result, message = check_HF_datasets(['invalid_dataset/train']) + mock_dataset_info.side_effect = Exception('Dataset not found') + result, _ = check_HF_datasets(['invalid_dataset/train']) assert not result -# Additional tests for private datasets, token issues, etc. +# Additional tests for private datasets, token issues, etc. def test_is_hf_dataset_path(): @@ -76,28 +108,24 @@ def test_is_hf_dataset_path(): @patch('transformers.AutoTokenizer.from_pretrained') -def test_create_om_cfg_instruction_finetune(mock_from_pretrained): +def test_create_om_cfg_instruction_finetune(mock_from_pretrained: Any): mock_from_pretrained.return_value = MagicMock(spec=AutoTokenizer) - args = Namespace( - task_type='INSTRUCTION_FINETUNE', - train_data_path='hf_dataset/train', - model='model_name', - context_length=512 - ) - cfg, tokenizer = create_om_cfg(args) + args = Namespace(task_type='INSTRUCTION_FINETUNE', + train_data_path='hf_dataset/train', + model='model_name', + context_length=512) + cfg, _ = create_om_cfg(args) assert cfg.dataset.hf_name == 'hf_dataset/train' assert cfg.dataset.max_seq_len == 512 + @patch('transformers.AutoTokenizer.from_pretrained') -def test_create_om_cfg_continued_pretrain(mock_from_pretrained): +def test_create_om_cfg_continued_pretrain(mock_from_pretrained: Any): mock_from_pretrained.return_value = MagicMock(spec=AutoTokenizer) - args = Namespace( - task_type='CONTINUED_PRETRAIN', - train_data_path='object_store_path', - model='model_name', - context_length=512 - ) - cfg, tokenizer = create_om_cfg(args) + args = Namespace(task_type='CONTINUED_PRETRAIN', + train_data_path='object_store_path', + model='model_name', + context_length=512) + cfg, _ = create_om_cfg(args) assert cfg.dataset.remote == 'object_store_path' assert cfg.dataset.max_seq_len == 512 - From e6e4a81f51974b63374c3d7debeb4441249c39f0 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Mon, 8 Jan 2024 11:21:49 -0800 Subject: [PATCH 10/63] update format --- scripts/data_prep/validate_and_tokenize_data.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/data_prep/validate_and_tokenize_data.py b/scripts/data_prep/validate_and_tokenize_data.py index de369af59d..9772f7662c 100644 --- a/scripts/data_prep/validate_and_tokenize_data.py +++ b/scripts/data_prep/validate_and_tokenize_data.py @@ -1,7 +1,8 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -# Databricks notebook source +# COMMAND ---------- + # MAGIC %md # MAGIC JIRA: https://databricks.atlassian.net/jira/software/c/projects/STR/issues/STR-141?filter=allissues From 
1668b9a4535b2e721053b21bd88c80f5a2d011c2 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Mon, 8 Jan 2024 11:27:25 -0800 Subject: [PATCH 11/63] update --- scripts/data_prep/validate_and_tokenize_data.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/data_prep/validate_and_tokenize_data.py b/scripts/data_prep/validate_and_tokenize_data.py index 9772f7662c..5f222b5466 100644 --- a/scripts/data_prep/validate_and_tokenize_data.py +++ b/scripts/data_prep/validate_and_tokenize_data.py @@ -1,5 +1,7 @@ -# Copyright 2022 MosaicML LLM Foundry authors -# SPDX-License-Identifier: Apache-2.0 +# Databricks notebook source +# MAGIC %md +# MAGIC Copyright 2022 MosaicML LLM Foundry authors +# MAGIC SPDX-License-Identifier: Apache-2.0 # COMMAND ---------- From 22191350f39ef99467d58269f425b173e29588e9 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Mon, 8 Jan 2024 11:29:38 -0800 Subject: [PATCH 12/63] nb source --- scripts/data_prep/validate_and_tokenize_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/data_prep/validate_and_tokenize_data.py b/scripts/data_prep/validate_and_tokenize_data.py index 5f222b5466..3b6c109199 100644 --- a/scripts/data_prep/validate_and_tokenize_data.py +++ b/scripts/data_prep/validate_and_tokenize_data.py @@ -1,6 +1,6 @@ # Databricks notebook source # MAGIC %md -# MAGIC Copyright 2022 MosaicML LLM Foundry authors +# MAGIC Copyright 2022 MosaicML LLM Foundry authors. # MAGIC SPDX-License-Identifier: Apache-2.0 # COMMAND ---------- From 86c6e87acc34f5bbc93d61e65754c940952d1907 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Fri, 22 Dec 2023 22:18:04 -0800 Subject: [PATCH 13/63] add validation script --- .../data_prep/validate_and_tokenize_data.py | 804 ++++++++++++++++++ 1 file changed, 804 insertions(+) create mode 100644 scripts/data_prep/validate_and_tokenize_data.py diff --git a/scripts/data_prep/validate_and_tokenize_data.py b/scripts/data_prep/validate_and_tokenize_data.py new file mode 100644 index 0000000000..68b1211f85 --- /dev/null +++ b/scripts/data_prep/validate_and_tokenize_data.py @@ -0,0 +1,804 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC JIRA: https://databricks.atlassian.net/jira/software/c/projects/STR/issues/STR-141?filter=allissues + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## Warning: Important Alert Regarding the Script Usage +# MAGIC +# MAGIC ### Script Purpose: +# MAGIC - **Not for Training**: This script is not utilized during the training process. +# MAGIC - **Ad-Hoc Validation**: It serves as an ad-hoc utility for users to run independently prior to starting fine-tuning. +# MAGIC - **Data Verification**: Its primary function is to validate the user's data before they invoke the Fine-Tuning (FT) API. +# MAGIC - **Cost Estimation**: Users can estimate the cost implications with this script. +# MAGIC +# MAGIC ### Usage Scenario: +# MAGIC This script is particularly useful in scenarios where there is a risk of data being malformed. It acts as a preventive measure to ensure data integrity and helps in cost assessment for the fine-tuning process. +# MAGIC +# MAGIC ### Note on Long-Term Solution: +# MAGIC - **Temporary Measure**: This script is a stop-gap solution. +# MAGIC - **Future Development**: We are in the process of developing a long-term data preparation service, which will eventually replace this script. 
+# MAGIC +# MAGIC ### Checks Include: +# MAGIC - check input dataset: +# MAGIC 1) verify if dataset input format is valid (need to be one of these: Huggingface, delta table, dbfs:/Volumes, cloud path); +# MAGIC - check HF input location: +# MAGIC 1) load dataset info and check if it is accessible; +# MAGIC 2) verify if the split exists. +# MAGIC - check cloud path location: +# MAGIC 1) check the cloud prefix is compliant with composers' object store supports (gs, s3, oci) +# MAGIC 2) check if list objects returns nothing. +# MAGIC - count_tokens: +# MAGIC 1) For IFT task: validate tokenization by running tokenizer + filter on the entire dataset. count the number of tokens. Throws error if there are any empty responses or prompts +# MAGIC 2) For CPT task: call donwload_text_to_mds.py and count the resulted mds dataset. Note this could take a long time. +# MAGIC +# MAGIC ### Questions: +# MAGIC - Is "download_text_to_mds.py" always callable from the validation script? +# MAGIC - what is the function to reuse to run tokenization on HF datasets with filters? +# MAGIC - The inputs to this validation script is assumed to be the same or a subset of the FT API arguments, i.e., a configuration like below. Is this a valid assumption? +# MAGIC ``` +# MAGIC cfg = { +# MAGIC model: str, +# MAGIC train_data_path: str, +# MAGIC save_folder: str, +# MAGIC *, +# MAGIC task_type: Optional[str] = "INSTRUCTION_FINETUNE", +# MAGIC eval_data_path: Optional[str] = None, +# MAGIC eval_prompts: Optional[List[str]] = None, +# MAGIC custom_weights_path: Optional[str] = None, +# MAGIC training_duration: Optional[str] = None, +# MAGIC learning_rate: Optional[float] = None, +# MAGIC context_length: Optional[int] = None, +# MAGIC experiment_trackers: Optional[List[Dict]] = None, +# MAGIC data_prep_config: Optional[Dict] = None, +# MAGIC disable_credentials_check: Optional[bool] = None, +# MAGIC timeout: Optional[float] = 10, +# MAGIC future: Literal[False] = False, +# MAGIC } +# MAGIC ``` + +# COMMAND ---------- + +# MAGIC %pip install llm-foundry + +# COMMAND ---------- + +dbutils.library.restartPython() + +# COMMAND ---------- + +import os +import re +from enum import Enum +from composer.utils import (ObjectStore, maybe_create_object_store_from_uri, parse_uri) +from torch.utils.data import DataLoader +from streaming import StreamingDataset +import numpy as np +from omegaconf import OmegaConf as om + +# COMMAND ---------- + +FT_API_args = Namespace( + model = 'EleutherAI/gpt-neox-20b', + train_data_path: str, + save_folder: str, + task_type: Optional[str] = "INSTRUCTION_FINETUNE", + eval_data_path = None, + eval_prompts = None, + custom_weights_path = None, + training_duration = None, + learning_rate = None, + context_length = None, + experiment_trackers = None, + disable_credentials_check = None, + # Extra argument to add to FT API + # See comment https://databricks.atlassian.net/browse/STR-141?focusedCommentId=4308948 + data_prep_config = {'data_validation': True, 'data_prep': False}, + timeout = 10, + future = False, +) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## Utility Functions + +# COMMAND ---------- + +def check_HF_datasets(dataset_names_with_splits): + from huggingface_hub import dataset_info + from datasets import get_dataset_split_names + import os + token = os.environ.get("HUGGING_FACE_HUB_TOKEN") + for dataset_name_with_split in dataset_names_with_splits: + dataset_name, split = os.path.split(dataset_name_with_split) + # make sure we have a dataset and split + if not dataset_name or not split: + 
return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Please ensure that you include the split name (e.g. 'mosaicml/dolly_hhrlhf/train')." + # check user access to the dataset + try: + info = dataset_info(dataset_name) + except: + token_warning = "" + if not token: + token_warning = " If this is a private dataset, please set your HUGGING_FACE_HUB_TOKEN using: mcli create secret hf." + return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Please ensure that the dataset exists and that you have access to it. Remember to include the split name (e.g. 'mosaicml/dolly_hhrlhf/train')." + token_warning + # check that split exists + try: + splits = get_dataset_split_names(dataset_name) + except: # error raised in the case of multiple subsets + return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Please make sure that the split is valid and that your dataset does not have subsets." + if split not in splits: + return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Split not found." + return True, "" + +# COMMAND ---------- + +def integrity_check(out: Union[str, Tuple[str, str]]): + """Check if the index file has integrity. + + If index is a cloud url, first download it to a temp local file. + + Args: + out (Union[str, Tuple[str,str]]): MDS dataset path + """ + + def get_expected(mds_root: str): + n_shard_files = 0 + cu = CloudUploader.get(mds_root, exist_ok=True, keep_local=True) + for o in cu.list_objects(): + if o.endswith('.mds'): + n_shard_files += 1 + return n_shard_files + + cu = CloudUploader.get(out, keep_local=True, exist_ok=True) + + with tempfile.TemporaryDirectory() as temp_dir: + if cu.remote: + download_file(os.path.join(cu.remote, 'index.json'), + os.path.join(temp_dir, 'index.json'), + timeout=60) + expected_n_shard_files = get_expected(cu.remote) + local_merged_index_path = os.path.join(temp_dir, 'index.json') + else: + local_merged_index_path = os.path.join(cu.local, 'index.json') + expected_n_shard_files = get_expected(cu.local) + + merged_index = json.load(open(local_merged_index_path, 'r')) + n_shard_files = len({b['raw_data']['basename'] for b in merged_index['shards']}) + assert n_shard_files == expected_n_shard_files, f'expected {expected_n_shard_files} shard files but got {n_shard_files}' + +# COMMAND ---------- + +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +# Taken from llmfoundry/scripts/data_prep/convert_text_to_mds.py + +import logging +import math +import os +import tempfile +from argparse import ArgumentParser, Namespace +from concurrent.futures import ProcessPoolExecutor +from glob import glob +from typing import Iterable, List, Tuple, cast + +import psutil +from composer.utils import (ObjectStore, maybe_create_object_store_from_uri, + parse_uri) +from streaming import MDSWriter +from tqdm import tqdm +from transformers import AutoTokenizer + +from llmfoundry.data import ConcatTokensDataset +from llmfoundry.utils.data_prep_utils import (DownloadingIterable, + merge_shard_groups) + +log = logging.getLogger(__name__) +DONE_FILENAME = '.text_to_mds_conversion_done' + + +def parse_args( tokenizer, + concat_tokens, + output_folder, + input_folder, + compression = 'zstd', + bos_text = '', + eos_text = '', + no_wrap = False , + processes = 32, # min(max(psutil.cpu_count() - 2, 1), 32), + reprocess = False ) -> Namespace: + + parsed = Namespace(tokenizer = tokenizer, + concat_tokens = model_max_length, + output_folder = output_folder, 
+ input_folder = input_folder, + eos_text = eos_text, + bos_text = bos_text, + no_wrap = no_wrap, + compression = compression, + processes = processes, + reprocess = reprocess) + + # Make sure we have needed concat options + if (parsed.concat_tokens is not None and + isinstance(parsed.concat_tokens, int) and parsed.tokenizer is None): + parser.error( + 'When setting --concat_tokens, you must specify a --tokenizer') + + # now that we have validated them, change BOS/EOS to strings + if parsed.bos_text is None: + parsed.bos_text = '' + if parsed.eos_text is None: + parsed.eos_text = '' + return parsed + + +def get_object_names(input_folder: str) -> List[str]: + """Get object names from a local or remote folder. + + Args: + input_folder (str): local or remote folder path. + """ + object_store = maybe_create_object_store_from_uri(input_folder) + if object_store is not None: + _, _, folder_prefix = parse_uri(input_folder) + names = [ + name for name in object_store.list_objects(folder_prefix) + if name.endswith('.txt') + ] + else: + # input_folder is a local folder + names = [ + text_file for dirpath, _, _ in os.walk(input_folder) + for text_file in glob(os.path.join(dirpath, '*.txt')) + ] + # return names, sizes + log.info(f'Found {len(names)} text files at {input_folder}') + + return names + + +def get_task_args( + object_names: List[str], + output_root: str, + input_folder: str, + n_groups: int, + tokenizer_name: str, + concat_tokens: int, + eos_text: str, + bos_text: str, + no_wrap: bool, + compression: str, +) -> Iterable: + """Get download_and_convert arguments split across n_groups. + + Each group handles a portion of object_names. + + Args: + object_names (List[str]): Names of objects to process + output_root (str): Folder to write MDS shards to + input_folder (str): Folder of text files to process + n_groups (int): Number of groups to split the object names into + tokenizer_name (str): Name of tokenizer to use + concat_tokens (int): Concantenate up to this many tokens + eos_text (str): Textend to append to each example to separate concatenated samples + bos_text (str): Text to prepend to each example to separate concatenated samples + no_wrap: (bool): Whether to let text examples wrap across multiple training examples + compression (str): The compression algorithm to use for MDS writing + """ + num_objects = len(object_names) + objs_per_group = math.ceil(num_objects / n_groups) + for group, i in enumerate(range(0, num_objects, objs_per_group)): + output_subdir = os.path.join(output_root, str(group)) + yield ( + object_names[i:min(i + objs_per_group, num_objects)], + output_subdir, + input_folder, + tokenizer_name, + concat_tokens, + eos_text, + bos_text, + no_wrap, + compression, + ) + + +def download_and_convert_starargs(args: Tuple): + """Helper function to call download_and_convert with star args. + + This helps us use download_and_convert with mutiprocessing. + """ + return download_and_convert(*args) + + +def download_and_convert( + file_names: List[str], + output_folder: str, + input_folder: str, + tokenizer_name: str, + concat_tokens: int, + eos_text: str, + bos_text: str, + no_wrap: bool, + compression: str, +): + """Downloads and converts text fies to MDS format. 
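A concrete illustration of how get_task_args above splits the object list across worker groups; only the grouping arithmetic is shown, with made-up sizes.

```
import math

num_objects, n_groups = 10, 4                        # made-up sizes
objs_per_group = math.ceil(num_objects / n_groups)   # -> 3
groups = [list(range(i, min(i + objs_per_group, num_objects)))
          for i in range(0, num_objects, objs_per_group)]
print(groups)  # [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
```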
+ + Args: + file_names (List[str]): Files to process + output_folder (str): Folder to write MDS shards to + input_folder (str): Folder of text files to process + tokenizer_name (str): Name of tokenizer to use + concat_tokens (int): Concantenate up to this many tokens + eos_text (str): Textend to append to each example to separate concatenated samples + bos_text (str): Text to prepend to each example to separate concatenated samples + no_wrap: (bool): Whether to let text examples wrap across multiple training examples + compression (str): The compression algorithm to use for MDS writing + """ + object_store = maybe_create_object_store_from_uri(input_folder) + + # Download file_names + with tempfile.TemporaryDirectory() as tmp_dir: + downloading_iter = DownloadingIterable(object_names=file_names, + output_folder=tmp_dir, + object_store=object_store) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + tokenizer.model_max_length = 5000000000 # Hack to prevent warnings from HuggingFace + + # Use the ConcatTokensDataset from LLM-foundry to concatenate sequences of tokens up + # to the maximum sequence length + dataset = ConcatTokensDataset( + hf_dataset=downloading_iter, + max_length=concat_tokens, + tokenizer=tokenizer, + eos_text=eos_text, + bos_text=bos_text, + no_wrap=no_wrap, + ) + + columns = {'tokens': 'bytes'} + + log.info('Converting to MDS format...') + with MDSWriter(out=output_folder, + columns=columns, + compression=compression) as out: + for sample in tqdm(dataset): + out.write(sample) + + +def is_remote_path(path: str) -> bool: + """Checks whether a path is a remote path. + + Args: + path (str): path to check + """ + backend, _, _ = parse_uri(path) + return backend != '' + + +def is_already_processed(output_root: str, args_str: str, + object_names: List[str]) -> bool: + """Determines whether a group of text files has already been processed. + + Checks the done fie at output root to determine this. + + Args: + output_root (str): Output folder where a done file may exist + args_str (str): String representation of the arguments + object_names (List[str]): Names of objects to convert to MDS format + """ + # Retrieve the done file contents + output_object_store = maybe_create_object_store_from_uri(output_root) + if output_object_store is not None: + # Download and read the done file from the remote object store + _, _, output_folder_prefix = parse_uri(output_root) + try: + with tempfile.TemporaryDirectory() as tmp_dir: + done_file = os.path.join(tmp_dir, DONE_FILENAME) + output_object_store.download_object( + os.path.join(output_folder_prefix, DONE_FILENAME), + done_file) + with open(done_file) as df: + done_file_contents = df.read().splitlines() + except FileNotFoundError: + return False + else: + # Read the local done file + done_file = os.path.join(output_root, DONE_FILENAME) + if not os.path.isfile(done_file): + return False + with open(done_file) as df: + done_file_contents = df.read().splitlines() + # Compare the arguments + prev_args_str = done_file_contents[0] + if prev_args_str != args_str: + return False + + # Compare file names + prev_names = done_file_contents[1:] + if len(prev_names) != len(object_names): + return False + for idx, prev_name in enumerate(prev_names): + if object_names[idx] != prev_name: + return False + return True + + +def write_done_file(folder: str, args_str: str, object_names: List[str]): + """Write a file to signify completion. + + This the done file includes the arguments to processing and + a list of objects that were processed. 
+ + Args: + folder (str): Folder to write the done file to + args_str (str): String representation of arguments + object_names (List[str]): List of objects to convert to MDS format + """ + with open(os.path.join(folder, DONE_FILENAME), 'w') as done_file: + done_file.write('\n'.join([args_str] + object_names) + '\n') + + +def convert_text_to_mds( + tokenizer_name: str, + output_folder: str, + input_folder: str, + concat_tokens: int, + eos_text: str, + bos_text: str, + no_wrap: bool, + compression: str, + processes: int, + args_str: str, + reprocess: bool, +): + """Convert a folder of text files to MDS format. + + Args: + tokenizer_name (str): Name of tokenizer to use + output_folder (str): Folder to write MDS shards to + input_folder (str): Folder of text files to process + concat_tokens (int): Concantenate up to this many tokens + eos_text (str): Textend to append to each example to separate concatenated samples + bos_text (str): Text to prepend to each example to separate concatenated samples + no_wrap: (bool): Whether to let text examples wrap across multiple training examples + compression (str): The compression algorithm to use for MDS writing + processes (int): The number of processes to use. + args_str (str): String representation of the arguments + reprocess (bool): Whether to always reprocess the given folder of text files + """ + is_remote_output = is_remote_path(output_folder) + + object_names = get_object_names(input_folder) + if len(object_names) == 0: + raise ValueError(f'No text files were found at {input_folder}.') + + # Check if the text files in the bucket have already been processed. + if not reprocess and is_already_processed(output_folder, args_str, + object_names): + log.info( + f'Input folder {input_folder} is already processed at {output_folder} and ' + + + 'reprocess is set to False. Set reprocess to True if you would like to force reprocessing.' + ) + return + + # Use a temporary local directory if the output is remote and there are more than 1 processes + local_output_folder = tempfile.TemporaryDirectory( + ).name if is_remote_output else output_folder + + if processes > 1: + # Download and convert the text files in parallel + args = get_task_args(object_names, local_output_folder, input_folder, + processes, tokenizer_name, concat_tokens, eos_text, + bos_text, no_wrap, compression) + with ProcessPoolExecutor(max_workers=processes) as executor: + list(executor.map(download_and_convert_starargs, args)) + + # Merge the mds shards from each of the processes into a single folder + merge_shard_groups(local_output_folder) + else: + download_and_convert(object_names, local_output_folder, input_folder, + tokenizer_name, concat_tokens, eos_text, bos_text, + no_wrap, compression) + + # Write a done file with the args and object names + write_done_file(local_output_folder, args_str, object_names) + + if is_remote_output: + # Upload the local output to the remote location + output_object_store = cast( + ObjectStore, maybe_create_object_store_from_uri(output_folder)) + _, _, output_folder_prefix = parse_uri(output_folder) + files_to_upload = os.listdir(local_output_folder) + + for file in files_to_upload: + assert not os.path.isdir(file) + remote_path = os.path.join(output_folder_prefix, file) + output_object_store.upload_object( + remote_path, os.path.join(local_output_folder, file)) + + +def _args_str(original_args: Namespace) -> str: + """Create a string from the args to determine whether to reprocess. + + Args: + original_args (Namespace): Arguments to main function. 
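A small round trip of the done-file convention used by write_done_file and is_already_processed above, assuming a throwaway local folder.

```
import os
import tempfile

with tempfile.TemporaryDirectory() as folder:
    write_done_file(folder, 'args-v1', ['a.txt', 'b.txt'])
    with open(os.path.join(folder, DONE_FILENAME)) as f:
        # First line is the args string, the rest are the processed objects.
        print(f.read().splitlines())  # ['args-v1', 'a.txt', 'b.txt']
```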
+ """ + # Take the arguments that influence the final result. + # reprocess and max_mds_writer_workers are not taken. + args = Namespace( + tokenizer_name=original_args.tokenizer, + output_folder=original_args.output_folder, + input_folder=original_args.input_folder, + concat_tokens=original_args.concat_tokens, + eos_text=original_args.eos_text, + bos_text=original_args.bos_text, + no_wrap=original_args.no_wrap, + compression=original_args.compression, + processes=original_args.processes, + ) + + return str(args) + + +# COMMAND ---------- + +def is_hf_dataset_path(path): + """Check if a given string is a dataset path used by Hugging Face. + + Args: + path (str): The string to be checked. + + Returns: + bool: True if the string is a dataset path, False otherwise. + """ + # Regular expression to match the dataset path pattern + pattern = r"^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+(/[\w]+)?/?$" + + return bool(re.match(pattern, path)) + + +def create_om_cfg(FT_API_args): + task_type = FT_API_args.task_type + train_data_path = FT_API_args.train_data_path + model = FT_API_args.model + max_seq_len = FT_API_args.context_length + + common_args = { + 'drop_last': False, + 'num_workers': 0, + 'prefetch_factor': None, + 'pin_memory': False, + 'persistent_workers': False, + 'timeout': 0 + } + if task == 'INSTRUCTION_FINETUNE': + cfg = om.create({ + 'dataset': { + 'hf_name': train_data_path, + 'split': 'train', + 'max_seq_len': max_seq_len, + 'decoder_only_format': True, + 'allow_pad_trimming': False, + 'shuffle': True, + }, + **common_args + }) + + else: + cfg = om.create({ + 'name': 'finetuning', + 'dataset': { + 'remote': train_data_path, + 'local': train_data_path, + 'split': 'train', + 'max_seq_len': max_seq_len, + 'decoder_only_format': True, + 'allow_pad_trimming': False, + 'packing_ratio': None, + 'shuffle': True, + }, + **common_args + }) + + tokenizer = build_tokenizer( + tokenizer_name=model, + tokenizer_kwargs={'model_max_length': max_seq_len}, + ) + + return cfg, tokenizer + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## Validate and token Count + +# COMMAND ---------- + +if task_type == 'INSTRUCTION_FINETUNE': + # check if train_data_path is a valid HF dataset url with splits. + # load dataset.info and see if HF tokens are correctly set. + check_HF_datasets() + +elif task_type == 'CONTINUED_PRETRAIN': + # check if train_data_path is a valid object store that composer supports + + # Run convert_text_to_mds.py and dump MDS dataset to "save_folder" + args = parse_args(tokenizer, concat_tokens, output_folder, input_folder) + convert_text_to_mds(tokenizer_name=args.tokenizer, + output_folder=args.output_folder, + input_folder=args.input_folder, + concat_tokens=args.concat_tokens, + eos_text=args.eos_text, + bos_text=args.bos_text, + no_wrap=args.no_wrap, + compression=args.compression, + processes=args.processes, + reprocess=args.reprocess, + args_str=_args_str(args)) + +else: + raise ValueError(f"task_type can only be INSTRUCTION_FINETUNE or Continued_Pretraining but got {task_type} instead!") + # Run a few checks on resulted MDS datasets + # 1. no shards in output_folder + # 2. 
check shard completeness by downloading and inspecting index.json + +import torch +from omegaconf import OmegaConf as om +from llmfoundry.utils import build_tokenizer + +# build cfg from the inputs + +from llmfoundry.data.finetuning import build_finetuning_dataloader +tokenizer_name = 'EleutherAI/gpt-neox-20b' +tokenizer_kwargs = {'model_max_length': cfg.dataset.max_seq_len} +tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) + +device_batch_size = 1 +dataloader = build_finetuning_dataloader(cfg, tokenizer, + device_batch_size).dataloader + +total_tokens = 0 +for batch in dataloader: + if len(batch['input_ids']) == 0 (check labels as well if exist): + raise Error + + batch_tokens = batch['input_ids'] (add 'labels' as well if exist) + batch_token_count = sum(len(tokens) for tokens in batch_tokens) + total_tokens += batch_token_count + +print("Total number of tokens:", total_tokens) + +# COMMAND ---------- + + + +# COMMAND ---------- + + + +# COMMAND ---------- + + + +# COMMAND ---------- + + + +# COMMAND ---------- + + + +# COMMAND ---------- + +# MAGIC %md +# MAGIC # DEPRECATED BELOW + +# COMMAND ---------- + +# If running on databricks notebook, the url can only be a Volume path. +# Make sure this is compliant to https://github.com/mosaicml/llm-foundry/blob/1191267195367b5ec6093ed7854b8f6daf1be2d3/llmfoundry/data/text_data.py#L174-L178 + +# raw dataset location you will point FT API to. +# It can be a local path or a remote path (s3/gcs/oci/dbfs:Volume) +dataset_url = 'tatsu-lab/alpaca' # "s3://xxxxx" or "HF name" +preprocessing_fn = 'llmfoundry.data.finetuning.tasks:alpaca_preprocessing_function' + +# dataset schema with tokens +tokenized_table_schema = {'tokens': bytes, 'id': np.int64} +tokenizer = 'EleutherAI/gpt-neox-20b' +tokenizer_kwargs = {'model_max_length': 2048} + +output_folder ='/Volumes/main/mosaic_hackathon/managed-volume/output' +input_folder = '' +eos_text = '<|endoftext|>' + + +# COMMAND ---------- + +if not dataset_url: + raise ValueError("dataset_url needs to be set at this point!") + +# COMMAND ---------- + + + +# COMMAND ---------- + +def check_cloud_datasets(dataset_url, job): + + suffix = '.txt' if job==Job.CPT else '.jsonl' + + object_store = maybe_create_object_store_from_uri(dataset_url) + + if object_store is not None: + _, _, folder_prefix = parse_uri(dataset_url) + names = [ + name for name in object_store.list_objects(folder_prefix) + if name.endswith(suffix) + ] + else: + # input_folder is a local folder + names = [ + text_file for dirpath, _, _ in os.walk(input_folder) + for text_file in glob(os.path.join(dirpath, '*.' + suffix)) + ] + assert len(names) > 0, f"No {suffix} files found in {dataset_url}." 
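The deprecated helpers above and below refer to a Job enum that is never defined in this file; a minimal stand-in, purely an assumption to make those sketches runnable, could look like this.

```
from enum import Enum

class Job(Enum):
    # Hypothetical values; the FT API itself uses task_type strings instead.
    IFT = 'INSTRUCTION_FINETUNE'
    CPT = 'CONTINUED_PRETRAIN'
```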
+ return names + +check_cloud_datasets(dataset_url, job) + + +# COMMAND ---------- + +def validate_and_count_tokens(dataset_url, cfg, job): + if job == Job.IFT: + # for IFT, basic data processing to see (1) well-formed JSONL and (2) strip of empty tokens + import torch + from omegaconf import OmegaConf as om + from llmfoundry.utils import build_tokenizer + + tokenizer_name = 'EleutherAI/gpt-neox-20b' + tokenizer_kwargs = {'model_max_length': cfg.dataset.max_seq_len} + tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) + device_batch_size = 2 + dataloader = build_finetuning_dataloader(cfg, tokenizer, + device_batch_size).dataloader + + packing = cfg.dataset.get('packing_ratio') is not None + + for i, batch in enumerate(dataloader): + if i >= 5: + break + print(f'-----Batch {i}-----') + for k, v in batch.items(): + if isinstance(v, torch.Tensor): + print(k, v.shape) + else: + print(k, v) + else: # job == Job.CPT: + # for CPT, strip empty txt files + print("Make sure the script is running within llmfoundry") + convert_text_to_mds(tokenizer = tokenizer , + concat_tokens = tokenizer_kwargs['model_max_length'], + output_folder = output_folder, + input_folder = input_folder, + eos_text = '<|endoftext|>') + + dataset=StreamingDataset(local='/Volumes/datasets/default/byod/cpt_poc/output/') # output has the streaming shards + dataloader = DataLoader(dataset) + sample = next(iter(dataloader)) + b = np.asarray(sample['tokens']).tobytes() + token_ids = np.frombuffer(b, dtype=np.int64) + n_token_per_sample = len(token_ids) + print('total_tokens = ', n_token_per_sample * dataset.num_samples) + +validate_and_count_tokens() # print overall stats of dataset + +# COMMAND ---------- + + From 678b3766b0a3db877189cd15142a340df8faeec5 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Tue, 2 Jan 2024 22:09:30 -0800 Subject: [PATCH 14/63] update --- .../data_prep/validate_and_tokenize_data.py | 244 +++++------------- 1 file changed, 66 insertions(+), 178 deletions(-) diff --git a/scripts/data_prep/validate_and_tokenize_data.py b/scripts/data_prep/validate_and_tokenize_data.py index 68b1211f85..9dbb78de3b 100644 --- a/scripts/data_prep/validate_and_tokenize_data.py +++ b/scripts/data_prep/validate_and_tokenize_data.py @@ -39,22 +39,22 @@ # MAGIC - The inputs to this validation script is assumed to be the same or a subset of the FT API arguments, i.e., a configuration like below. Is this a valid assumption? 
# MAGIC ``` # MAGIC cfg = { -# MAGIC model: str, -# MAGIC train_data_path: str, -# MAGIC save_folder: str, -# MAGIC *, -# MAGIC task_type: Optional[str] = "INSTRUCTION_FINETUNE", -# MAGIC eval_data_path: Optional[str] = None, -# MAGIC eval_prompts: Optional[List[str]] = None, -# MAGIC custom_weights_path: Optional[str] = None, -# MAGIC training_duration: Optional[str] = None, -# MAGIC learning_rate: Optional[float] = None, -# MAGIC context_length: Optional[int] = None, -# MAGIC experiment_trackers: Optional[List[Dict]] = None, -# MAGIC data_prep_config: Optional[Dict] = None, -# MAGIC disable_credentials_check: Optional[bool] = None, -# MAGIC timeout: Optional[float] = 10, -# MAGIC future: Literal[False] = False, +# MAGIC model: str, +# MAGIC train_data_path: str, +# MAGIC save_folder: str, +# MAGIC *, +# MAGIC task_type: Optional[str] = "INSTRUCTION_FINETUNE", +# MAGIC eval_data_path: Optional[str] = None, +# MAGIC eval_prompts: Optional[List[str]] = None, +# MAGIC custom_weights_path: Optional[str] = None, +# MAGIC training_duration: Optional[str] = None, +# MAGIC learning_rate: Optional[float] = None, +# MAGIC context_length: Optional[int] = None, +# MAGIC experiment_trackers: Optional[List[Dict]] = None, +# MAGIC data_prep_config: Optional[Dict] = None, +# MAGIC disable_credentials_check: Optional[bool] = None, +# MAGIC timeout: Optional[float] = 10, +# MAGIC future: Literal[False] = False, # MAGIC } # MAGIC ``` @@ -76,20 +76,24 @@ from streaming import StreamingDataset import numpy as np from omegaconf import OmegaConf as om +from argparse import Namespace +from typing import Union, Tuple +from llmfoundry.utils import build_tokenizer +import torch # COMMAND ---------- FT_API_args = Namespace( model = 'EleutherAI/gpt-neox-20b', - train_data_path: str, - save_folder: str, - task_type: Optional[str] = "INSTRUCTION_FINETUNE", + train_data_path = 'tatsu-lab/alpaca', # 'mosaicml/dolly_hhrlhf/train', # tatsu-lab/alpaca/train', + save_folder = 'dbfs:/databricks/mlflow-tracking/EXPERIMENT_ID/RUN_ID/artifacts/checkpoints', + task_type = "INSTRUCTION_FINETUNE", eval_data_path = None, eval_prompts = None, custom_weights_path = None, training_duration = None, learning_rate = None, - context_length = None, + context_length = 2048, experiment_trackers = None, disable_credentials_check = None, # Extra argument to add to FT API @@ -562,6 +566,21 @@ def is_hf_dataset_path(path): return bool(re.match(pattern, path)) + + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## Validate and token Count + +# COMMAND ---------- + +os.environ['HF_ASSETS_CACHE'] = '/tmp/' +os.environ['HF_HOME'] = '/tmp/' +os.environ['HF_HUB_CACHE'] = '/tmp/' +os.environ['HF_DATASETS_CACHE'] = '/tmp/' + + def create_om_cfg(FT_API_args): task_type = FT_API_args.task_type train_data_path = FT_API_args.train_data_path @@ -570,13 +589,13 @@ def create_om_cfg(FT_API_args): common_args = { 'drop_last': False, - 'num_workers': 0, - 'prefetch_factor': None, + 'num_workers': 2, + 'prefetch_factor': 2, 'pin_memory': False, 'persistent_workers': False, 'timeout': 0 } - if task == 'INSTRUCTION_FINETUNE': + if task_type == 'INSTRUCTION_FINETUNE': cfg = om.create({ 'dataset': { 'hf_name': train_data_path, @@ -609,25 +628,30 @@ def create_om_cfg(FT_API_args): tokenizer_name=model, tokenizer_kwargs={'model_max_length': max_seq_len}, ) - + return cfg, tokenizer # COMMAND ---------- -# MAGIC %md -# MAGIC ## Validate and token Count - -# COMMAND ---------- - -if task_type == 'INSTRUCTION_FINETUNE': - # check if train_data_path is a valid HF dataset url 
with splits. - # load dataset.info and see if HF tokens are correctly set. - check_HF_datasets() - -elif task_type == 'CONTINUED_PRETRAIN': - # check if train_data_path is a valid object store that composer supports +# build cfg from the inputs - # Run convert_text_to_mds.py and dump MDS dataset to "save_folder" +if FT_API_args.task_type == 'INSTRUCTION_FINETUNE': + # check if train_data_path is a valid HF dataset url with splits. + # load dataset.info and see if HF tokens are correctly set. + # check_HF_datasets() + + cfg, tokenizer = create_om_cfg(FT_API_args) + +elif FT_API_args.task_type == 'CONTINUED_PRETRAIN': + # check if train_data_path is a valid object store that composer supports + cfg, tokenizer = create_om_cfg(FT_API_args) + + input_folder = FT_API_args.train_data_path + output_folder = FT_API_args.save_folder + concat_tokens = FT_API_args.context_length + tokenizer_name = FT_API_args.model + + # Run convert_text_to_mds.py and dump MDS dataset to "save_folder" args = parse_args(tokenizer, concat_tokens, output_folder, input_folder) convert_text_to_mds(tokenizer_name=args.tokenizer, output_folder=args.output_folder, @@ -640,18 +664,12 @@ def create_om_cfg(FT_API_args): processes=args.processes, reprocess=args.reprocess, args_str=_args_str(args)) - else: - raise ValueError(f"task_type can only be INSTRUCTION_FINETUNE or Continued_Pretraining but got {task_type} instead!") + raise ValueError(f"task_type can only be INSTRUCTION_FINETUNE or Continued_Pretraining but got {FT_API_args.task_type} instead!") # Run a few checks on resulted MDS datasets # 1. no shards in output_folder # 2. check shard completeness by downloading and inspecting index.json -import torch -from omegaconf import OmegaConf as om -from llmfoundry.utils import build_tokenizer - -# build cfg from the inputs from llmfoundry.data.finetuning import build_finetuning_dataloader tokenizer_name = 'EleutherAI/gpt-neox-20b' @@ -664,141 +682,11 @@ def create_om_cfg(FT_API_args): total_tokens = 0 for batch in dataloader: - if len(batch['input_ids']) == 0 (check labels as well if exist): - raise Error + if len(batch['input_ids']) == 0: # (check labels as well if exist): + raise ValueError('input_ids is empty') - batch_tokens = batch['input_ids'] (add 'labels' as well if exist) - batch_token_count = sum(len(tokens) for tokens in batch_tokens) + batch_tokens = batch['input_ids'] # (add 'labels' as well if exist) + batch_token_count = sum([len(tokens) for tokens in batch_tokens]) total_tokens += batch_token_count print("Total number of tokens:", total_tokens) - -# COMMAND ---------- - - - -# COMMAND ---------- - - - -# COMMAND ---------- - - - -# COMMAND ---------- - - - -# COMMAND ---------- - - - -# COMMAND ---------- - -# MAGIC %md -# MAGIC # DEPRECATED BELOW - -# COMMAND ---------- - -# If running on databricks notebook, the url can only be a Volume path. -# Make sure this is compliant to https://github.com/mosaicml/llm-foundry/blob/1191267195367b5ec6093ed7854b8f6daf1be2d3/llmfoundry/data/text_data.py#L174-L178 - -# raw dataset location you will point FT API to. 
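Since one stated purpose of this script is cost estimation, the token count printed above can be turned into a rough estimate. The price and epoch count below are placeholders, not actual FT API pricing.

```
# Hypothetical numbers purely for illustration.
PRICE_PER_MILLION_TOKENS_USD = 2.0
N_EPOCHS = 1

estimated_cost = total_tokens / 1_000_000 * PRICE_PER_MILLION_TOKENS_USD * N_EPOCHS
print(f'Estimated cost: ~${estimated_cost:.2f} for {N_EPOCHS} epoch(s) over {total_tokens} tokens')
```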
-# It can be a local path or a remote path (s3/gcs/oci/dbfs:Volume) -dataset_url = 'tatsu-lab/alpaca' # "s3://xxxxx" or "HF name" -preprocessing_fn = 'llmfoundry.data.finetuning.tasks:alpaca_preprocessing_function' - -# dataset schema with tokens -tokenized_table_schema = {'tokens': bytes, 'id': np.int64} -tokenizer = 'EleutherAI/gpt-neox-20b' -tokenizer_kwargs = {'model_max_length': 2048} - -output_folder ='/Volumes/main/mosaic_hackathon/managed-volume/output' -input_folder = '' -eos_text = '<|endoftext|>' - - -# COMMAND ---------- - -if not dataset_url: - raise ValueError("dataset_url needs to be set at this point!") - -# COMMAND ---------- - - - -# COMMAND ---------- - -def check_cloud_datasets(dataset_url, job): - - suffix = '.txt' if job==Job.CPT else '.jsonl' - - object_store = maybe_create_object_store_from_uri(dataset_url) - - if object_store is not None: - _, _, folder_prefix = parse_uri(dataset_url) - names = [ - name for name in object_store.list_objects(folder_prefix) - if name.endswith(suffix) - ] - else: - # input_folder is a local folder - names = [ - text_file for dirpath, _, _ in os.walk(input_folder) - for text_file in glob(os.path.join(dirpath, '*.' + suffix)) - ] - assert len(names) > 0, f"No {suffix} files found in {dataset_url}." - return names - -check_cloud_datasets(dataset_url, job) - - -# COMMAND ---------- - -def validate_and_count_tokens(dataset_url, cfg, job): - if job == Job.IFT: - # for IFT, basic data processing to see (1) well-formed JSONL and (2) strip of empty tokens - import torch - from omegaconf import OmegaConf as om - from llmfoundry.utils import build_tokenizer - - tokenizer_name = 'EleutherAI/gpt-neox-20b' - tokenizer_kwargs = {'model_max_length': cfg.dataset.max_seq_len} - tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) - device_batch_size = 2 - dataloader = build_finetuning_dataloader(cfg, tokenizer, - device_batch_size).dataloader - - packing = cfg.dataset.get('packing_ratio') is not None - - for i, batch in enumerate(dataloader): - if i >= 5: - break - print(f'-----Batch {i}-----') - for k, v in batch.items(): - if isinstance(v, torch.Tensor): - print(k, v.shape) - else: - print(k, v) - else: # job == Job.CPT: - # for CPT, strip empty txt files - print("Make sure the script is running within llmfoundry") - convert_text_to_mds(tokenizer = tokenizer , - concat_tokens = tokenizer_kwargs['model_max_length'], - output_folder = output_folder, - input_folder = input_folder, - eos_text = '<|endoftext|>') - - dataset=StreamingDataset(local='/Volumes/datasets/default/byod/cpt_poc/output/') # output has the streaming shards - dataloader = DataLoader(dataset) - sample = next(iter(dataloader)) - b = np.asarray(sample['tokens']).tobytes() - token_ids = np.frombuffer(b, dtype=np.int64) - n_token_per_sample = len(token_ids) - print('total_tokens = ', n_token_per_sample * dataset.num_samples) - -validate_and_count_tokens() # print overall stats of dataset - -# COMMAND ---------- - - From 297e057d7aa746709df56d5017bb0da23b62b6ac Mon Sep 17 00:00:00 2001 From: xiaohanzhan-db Date: Wed, 3 Jan 2024 07:51:25 +0000 Subject: [PATCH 15/63] change token count function --- .../data_prep/validate_and_tokenize_data.py | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/scripts/data_prep/validate_and_tokenize_data.py b/scripts/data_prep/validate_and_tokenize_data.py index 9dbb78de3b..6e4364bee6 100644 --- a/scripts/data_prep/validate_and_tokenize_data.py +++ b/scripts/data_prep/validate_and_tokenize_data.py @@ -139,6 +139,10 
@@ def check_HF_datasets(dataset_names_with_splits): # COMMAND ---------- +from streaming.base.storage.upload import CloudUploader +from streaming.base.storage.download import download_file +import json + def integrity_check(out: Union[str, Tuple[str, str]]): """Check if the index file has integrity. @@ -565,9 +569,6 @@ def is_hf_dataset_path(path): return bool(re.match(pattern, path)) - - - # COMMAND ---------- # MAGIC %md @@ -677,16 +678,23 @@ def create_om_cfg(FT_API_args): tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) device_batch_size = 1 -dataloader = build_finetuning_dataloader(cfg, tokenizer, - device_batch_size).dataloader +dataspec = build_finetuning_dataloader(cfg, tokenizer, device_batch_size) +dataloader = dataspec.dataloader +token_counting_func = dataspec.get_num_tokens_in_batch total_tokens = 0 for batch in dataloader: - if len(batch['input_ids']) == 0: # (check labels as well if exist): - raise ValueError('input_ids is empty') + total_tokens += token_counting_func(batch) + + # if len(batch['input_ids']) == 0: # (check labels as well if exist): + # raise ValueError('input_ids is empty') - batch_tokens = batch['input_ids'] # (add 'labels' as well if exist) - batch_token_count = sum([len(tokens) for tokens in batch_tokens]) - total_tokens += batch_token_count + # batch_tokens = batch['input_ids'] # (add 'labels' as well if exist) + # batch_token_count = sum([len(tokens) for tokens in batch_tokens]) + # total_tokens += batch_token_count print("Total number of tokens:", total_tokens) + +# COMMAND ---------- + + From 09d0ebbdd1d2ee03258313cebabd01bf29743e87 Mon Sep 17 00:00:00 2001 From: xiaohanzhan-db Date: Fri, 5 Jan 2024 06:34:53 +0000 Subject: [PATCH 16/63] reorganize cells --- .../data_prep/validate_and_tokenize_data.py | 180 +++++++++--------- 1 file changed, 89 insertions(+), 91 deletions(-) diff --git a/scripts/data_prep/validate_and_tokenize_data.py b/scripts/data_prep/validate_and_tokenize_data.py index 6e4364bee6..b81811a5fb 100644 --- a/scripts/data_prep/validate_and_tokenize_data.py +++ b/scripts/data_prep/validate_and_tokenize_data.py @@ -83,6 +83,11 @@ # COMMAND ---------- +# MAGIC %md +# MAGIC ## User Defines the Cell Below + +# COMMAND ---------- + FT_API_args = Namespace( model = 'EleutherAI/gpt-neox-20b', train_data_path = 'tatsu-lab/alpaca', # 'mosaicml/dolly_hhrlhf/train', # tatsu-lab/alpaca/train', @@ -103,79 +108,15 @@ future = False, ) -# COMMAND ---------- - -# MAGIC %md -# MAGIC ## Utility Functions - -# COMMAND ---------- - -def check_HF_datasets(dataset_names_with_splits): - from huggingface_hub import dataset_info - from datasets import get_dataset_split_names - import os - token = os.environ.get("HUGGING_FACE_HUB_TOKEN") - for dataset_name_with_split in dataset_names_with_splits: - dataset_name, split = os.path.split(dataset_name_with_split) - # make sure we have a dataset and split - if not dataset_name or not split: - return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Please ensure that you include the split name (e.g. 'mosaicml/dolly_hhrlhf/train')." - # check user access to the dataset - try: - info = dataset_info(dataset_name) - except: - token_warning = "" - if not token: - token_warning = " If this is a private dataset, please set your HUGGING_FACE_HUB_TOKEN using: mcli create secret hf." - return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Please ensure that the dataset exists and that you have access to it. Remember to include the split name (e.g. 
'mosaicml/dolly_hhrlhf/train')." + token_warning - # check that split exists - try: - splits = get_dataset_split_names(dataset_name) - except: # error raised in the case of multiple subsets - return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Please make sure that the split is valid and that your dataset does not have subsets." - if split not in splits: - return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Split not found." - return True, "" +os.environ['HF_ASSETS_CACHE'] = '/tmp/' +os.environ['HF_HOME'] = '/tmp/' +os.environ['HF_HUB_CACHE'] = '/tmp/' +os.environ['HF_DATASETS_CACHE'] = '/tmp/' # COMMAND ---------- -from streaming.base.storage.upload import CloudUploader -from streaming.base.storage.download import download_file -import json - -def integrity_check(out: Union[str, Tuple[str, str]]): - """Check if the index file has integrity. - - If index is a cloud url, first download it to a temp local file. - - Args: - out (Union[str, Tuple[str,str]]): MDS dataset path - """ - - def get_expected(mds_root: str): - n_shard_files = 0 - cu = CloudUploader.get(mds_root, exist_ok=True, keep_local=True) - for o in cu.list_objects(): - if o.endswith('.mds'): - n_shard_files += 1 - return n_shard_files - - cu = CloudUploader.get(out, keep_local=True, exist_ok=True) - - with tempfile.TemporaryDirectory() as temp_dir: - if cu.remote: - download_file(os.path.join(cu.remote, 'index.json'), - os.path.join(temp_dir, 'index.json'), - timeout=60) - expected_n_shard_files = get_expected(cu.remote) - local_merged_index_path = os.path.join(temp_dir, 'index.json') - else: - local_merged_index_path = os.path.join(cu.local, 'index.json') - expected_n_shard_files = get_expected(cu.local) - - merged_index = json.load(open(local_merged_index_path, 'r')) - n_shard_files = len({b['raw_data']['basename'] for b in merged_index['shards']}) - assert n_shard_files == expected_n_shard_files, f'expected {expected_n_shard_files} shard files but got {n_shard_files}' +# MAGIC %md +# MAGIC ## Adapted from llmfoundry/scripts/data_prep/convert_text_to_mds.py # COMMAND ---------- @@ -555,6 +496,76 @@ def _args_str(original_args: Namespace) -> str: # COMMAND ---------- +# MAGIC %md +# MAGIC ## Validate Inputs and Count tokens + +# COMMAND ---------- + +from streaming.base.storage.upload import CloudUploader +from streaming.base.storage.download import download_file +import json + +def integrity_check(out: Union[str, Tuple[str, str]]): + """Check if the index file has integrity. + + If index is a cloud url, first download it to a temp local file. 
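+
+    Returns True when the number of .mds shard files found under the dataset
+    root matches the shards listed in the merged index.json, False otherwise.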
+ + Args: + out (Union[str, Tuple[str,str]]): MDS dataset path + """ + + def count_shards(mds_root: str): + n_shard_files = 0 + cu = CloudUploader.get(mds_root, exist_ok=True, keep_local=True) + for o in cu.list_objects(): + if o.endswith('.mds'): + n_shard_files += 1 + return n_shard_files + + cu = CloudUploader.get(out, keep_local=True, exist_ok=True) + + with tempfile.TemporaryDirectory() as temp_dir: + if cu.remote: + download_file(os.path.join(cu.remote, 'index.json'), + os.path.join(temp_dir, 'index.json'), + timeout=60) + actual_n_shard_files = count_shards(cu.remote) + local_merged_index_path = os.path.join(temp_dir, 'index.json') + else: + local_merged_index_path = os.path.join(cu.local, 'index.json') + actual_n_shard_files = count_shards(cu.local) + + merged_index = json.load(open(local_merged_index_path, 'r')) + n_shard_files = len({b['raw_data']['basename'] for b in merged_index['shards']}) + return n_shard_files == actual_n_shard_files + +def check_HF_datasets(dataset_names_with_splits): + from huggingface_hub import dataset_info + from datasets import get_dataset_split_names + import os + token = os.environ.get("HUGGING_FACE_HUB_TOKEN") + for dataset_name_with_split in dataset_names_with_splits: + dataset_name, split = os.path.split(dataset_name_with_split) + # make sure we have a dataset and split + if not dataset_name or not split: + return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Please ensure that you include the split name (e.g. 'mosaicml/dolly_hhrlhf/train')." + # check user access to the dataset + try: + info = dataset_info(dataset_name) + except: + token_warning = "" + if not token: + token_warning = " If this is a private dataset, please set your HUGGING_FACE_HUB_TOKEN using: mcli create secret hf." + return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Please ensure that the dataset exists and that you have access to it. Remember to include the split name (e.g. 'mosaicml/dolly_hhrlhf/train')." + token_warning + # check that split exists + try: + splits = get_dataset_split_names(dataset_name) + except: # error raised in the case of multiple subsets + return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Please make sure that the split is valid and that your dataset does not have subsets." + if split not in splits: + return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Split not found." + return True, "" + def is_hf_dataset_path(path): """Check if a given string is a dataset path used by Hugging Face. @@ -569,19 +580,6 @@ def is_hf_dataset_path(path): return bool(re.match(pattern, path)) -# COMMAND ---------- - -# MAGIC %md -# MAGIC ## Validate and token Count - -# COMMAND ---------- - -os.environ['HF_ASSETS_CACHE'] = '/tmp/' -os.environ['HF_HOME'] = '/tmp/' -os.environ['HF_HUB_CACHE'] = '/tmp/' -os.environ['HF_DATASETS_CACHE'] = '/tmp/' - - def create_om_cfg(FT_API_args): task_type = FT_API_args.task_type train_data_path = FT_API_args.train_data_path @@ -638,8 +636,10 @@ def create_om_cfg(FT_API_args): if FT_API_args.task_type == 'INSTRUCTION_FINETUNE': # check if train_data_path is a valid HF dataset url with splits. + if not is_hf_dataset_path(FT_API_args.train_data_path): + raise ValueError(f"Input path {FT_API_args.train_data_path} is not supported. It needs to be a valid Huggingface dataset path.") # load dataset.info and see if HF tokens are correctly set. 
- # check_HF_datasets() + check_HF_datasets(FT_API_args.train_data_path) cfg, tokenizer = create_om_cfg(FT_API_args) @@ -665,6 +665,11 @@ def create_om_cfg(FT_API_args): processes=args.processes, reprocess=args.reprocess, args_str=_args_str(args)) + + # Check if the MDS dataset is integral by checking index.json + if integrity_check(args.output_folder): + raise RuntimeError(f"{args.output_folder} has mismatched number of shard files between merged index.json and actual shards!") + else: raise ValueError(f"task_type can only be INSTRUCTION_FINETUNE or Continued_Pretraining but got {FT_API_args.task_type} instead!") # Run a few checks on resulted MDS datasets @@ -686,13 +691,6 @@ def create_om_cfg(FT_API_args): for batch in dataloader: total_tokens += token_counting_func(batch) - # if len(batch['input_ids']) == 0: # (check labels as well if exist): - # raise ValueError('input_ids is empty') - - # batch_tokens = batch['input_ids'] # (add 'labels' as well if exist) - # batch_token_count = sum([len(tokens) for tokens in batch_tokens]) - # total_tokens += batch_token_count - print("Total number of tokens:", total_tokens) # COMMAND ---------- From 460df65cd611f1161cd23cfa8eca051c97be405e Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Fri, 5 Jan 2024 00:42:12 -0800 Subject: [PATCH 17/63] Add unit tests --- .../data_prep/validate_and_tokenize_data.py | 140 +++++++++--------- .../test_validate_and_tokenize_data.py | 103 +++++++++++++ 2 files changed, 173 insertions(+), 70 deletions(-) create mode 100644 tests/a_scripts/data_prep/test_validate_and_tokenize_data.py diff --git a/scripts/data_prep/validate_and_tokenize_data.py b/scripts/data_prep/validate_and_tokenize_data.py index b81811a5fb..ae2d1129b3 100644 --- a/scripts/data_prep/validate_and_tokenize_data.py +++ b/scripts/data_prep/validate_and_tokenize_data.py @@ -64,7 +64,7 @@ # COMMAND ---------- -dbutils.library.restartPython() +# dbutils.library.restartPython() # COMMAND ---------- @@ -77,13 +77,14 @@ import numpy as np from omegaconf import OmegaConf as om from argparse import Namespace -from typing import Union, Tuple +from typing import Union, Tuple from llmfoundry.utils import build_tokenizer -import torch +from huggingface_hub import dataset_info +from datasets import get_dataset_split_names # COMMAND ---------- -# MAGIC %md +# MAGIC %md # MAGIC ## User Defines the Cell Below # COMMAND ---------- @@ -503,7 +504,7 @@ def _args_str(original_args: Namespace) -> str: from streaming.base.storage.upload import CloudUploader from streaming.base.storage.download import download_file -import json +import json def integrity_check(out: Union[str, Tuple[str, str]]): """Check if the index file has integrity. @@ -538,11 +539,8 @@ def count_shards(mds_root: str): merged_index = json.load(open(local_merged_index_path, 'r')) n_shard_files = len({b['raw_data']['basename'] for b in merged_index['shards']}) return n_shard_files == actual_n_shard_files - + def check_HF_datasets(dataset_names_with_splits): - from huggingface_hub import dataset_info - from datasets import get_dataset_split_names - import os token = os.environ.get("HUGGING_FACE_HUB_TOKEN") for dataset_name_with_split in dataset_names_with_splits: dataset_name, split = os.path.split(dataset_name_with_split) @@ -565,7 +563,7 @@ def check_HF_datasets(dataset_names_with_splits): if split not in splits: return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Split not found." 
return True, "" - + def is_hf_dataset_path(path): """Check if a given string is a dataset path used by Hugging Face. @@ -576,7 +574,7 @@ def is_hf_dataset_path(path): bool: True if the string is a dataset path, False otherwise. """ # Regular expression to match the dataset path pattern - pattern = r"^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+(/[\w]+)?/?$" + pattern = r"^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+/?(train|validation|test)?/?$" return bool(re.match(pattern, path)) @@ -627,72 +625,74 @@ def create_om_cfg(FT_API_args): tokenizer_name=model, tokenizer_kwargs={'model_max_length': max_seq_len}, ) - + return cfg, tokenizer # COMMAND ---------- # build cfg from the inputs +def main(): + if FT_API_args.task_type == 'INSTRUCTION_FINETUNE': + # check if train_data_path is a valid HF dataset url with splits. + if not is_hf_dataset_path(FT_API_args.train_data_path): + raise ValueError(f"Input path {FT_API_args.train_data_path} is not supported. It needs to be a valid Huggingface dataset path.") + # load dataset.info and see if HF tokens are correctly set. + check_HF_datasets(FT_API_args.train_data_path) + + cfg, tokenizer = create_om_cfg(FT_API_args) + + elif FT_API_args.task_type == 'CONTINUED_PRETRAIN': + # check if train_data_path is a valid object store that composer supports + cfg, tokenizer = create_om_cfg(FT_API_args) + + input_folder = FT_API_args.train_data_path + output_folder = FT_API_args.save_folder + concat_tokens = FT_API_args.context_length + tokenizer_name = FT_API_args.model + + # Run convert_text_to_mds.py and dump MDS dataset to "save_folder" + args = parse_args(tokenizer, concat_tokens, output_folder, input_folder) + convert_text_to_mds(tokenizer_name=args.tokenizer, + output_folder=args.output_folder, + input_folder=args.input_folder, + concat_tokens=args.concat_tokens, + eos_text=args.eos_text, + bos_text=args.bos_text, + no_wrap=args.no_wrap, + compression=args.compression, + processes=args.processes, + reprocess=args.reprocess, + args_str=_args_str(args)) + + # Check if the MDS dataset is integral by checking index.json + if integrity_check(args.output_folder): + raise RuntimeError(f"{args.output_folder} has mismatched number of shard files between merged index.json and actual shards!") + + else: + raise ValueError(f"task_type can only be INSTRUCTION_FINETUNE or Continued_Pretraining but got {FT_API_args.task_type} instead!") + # Run a few checks on resulted MDS datasets + # 1. no shards in output_folder + # 2. check shard completeness by downloading and inspecting index.json + + + from llmfoundry.data.finetuning import build_finetuning_dataloader + tokenizer_name = 'EleutherAI/gpt-neox-20b' + tokenizer_kwargs = {'model_max_length': cfg.dataset.max_seq_len} + tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) + + device_batch_size = 1 + dataspec = build_finetuning_dataloader(cfg, tokenizer, device_batch_size) + dataloader = dataspec.dataloader + token_counting_func = dataspec.get_num_tokens_in_batch + + total_tokens = 0 + for batch in dataloader: + total_tokens += token_counting_func(batch) -if FT_API_args.task_type == 'INSTRUCTION_FINETUNE': - # check if train_data_path is a valid HF dataset url with splits. - if not is_hf_dataset_path(FT_API_args.train_data_path): - raise ValueError(f"Input path {FT_API_args.train_data_path} is not supported. It needs to be a valid Huggingface dataset path.") - # load dataset.info and see if HF tokens are correctly set. 
- check_HF_datasets(FT_API_args.train_data_path) - - cfg, tokenizer = create_om_cfg(FT_API_args) - -elif FT_API_args.task_type == 'CONTINUED_PRETRAIN': - # check if train_data_path is a valid object store that composer supports - cfg, tokenizer = create_om_cfg(FT_API_args) - - input_folder = FT_API_args.train_data_path - output_folder = FT_API_args.save_folder - concat_tokens = FT_API_args.context_length - tokenizer_name = FT_API_args.model - - # Run convert_text_to_mds.py and dump MDS dataset to "save_folder" - args = parse_args(tokenizer, concat_tokens, output_folder, input_folder) - convert_text_to_mds(tokenizer_name=args.tokenizer, - output_folder=args.output_folder, - input_folder=args.input_folder, - concat_tokens=args.concat_tokens, - eos_text=args.eos_text, - bos_text=args.bos_text, - no_wrap=args.no_wrap, - compression=args.compression, - processes=args.processes, - reprocess=args.reprocess, - args_str=_args_str(args)) - - # Check if the MDS dataset is integral by checking index.json - if integrity_check(args.output_folder): - raise RuntimeError(f"{args.output_folder} has mismatched number of shard files between merged index.json and actual shards!") - -else: - raise ValueError(f"task_type can only be INSTRUCTION_FINETUNE or Continued_Pretraining but got {FT_API_args.task_type} instead!") - # Run a few checks on resulted MDS datasets - # 1. no shards in output_folder - # 2. check shard completeness by downloading and inspecting index.json - - -from llmfoundry.data.finetuning import build_finetuning_dataloader -tokenizer_name = 'EleutherAI/gpt-neox-20b' -tokenizer_kwargs = {'model_max_length': cfg.dataset.max_seq_len} -tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) - -device_batch_size = 1 -dataspec = build_finetuning_dataloader(cfg, tokenizer, device_batch_size) -dataloader = dataspec.dataloader -token_counting_func = dataspec.get_num_tokens_in_batch - -total_tokens = 0 -for batch in dataloader: - total_tokens += token_counting_func(batch) - -print("Total number of tokens:", total_tokens) + print("Total number of tokens:", total_tokens) # COMMAND ---------- +if __name__ == '__main__': + main() diff --git a/tests/a_scripts/data_prep/test_validate_and_tokenize_data.py b/tests/a_scripts/data_prep/test_validate_and_tokenize_data.py new file mode 100644 index 0000000000..5b3b5b561b --- /dev/null +++ b/tests/a_scripts/data_prep/test_validate_and_tokenize_data.py @@ -0,0 +1,103 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 +from unittest.mock import Mock, patch, MagicMock, mock_open +from argparse import Namespace +from scripts.data_prep.validate_and_tokenize_data import integrity_check, check_HF_datasets, is_hf_dataset_path, create_om_cfg +from streaming.base.storage.upload import CloudUploader +from transformers import AutoTokenizer + +class MockCloudUploader: + def __init__(self): + self.remote = "some_remote_path" + self.local = "some_local_path" + + def list_objects(self): + return ['shard1.mds', 'shard2.mds'] + +class MockDatasetInfo: + def __init__(self): + self.id = "valid_dataset" + self.description = "A mock dataset description" + +@patch('scripts.data_prep.validate_and_tokenize_data.CloudUploader.get') +@patch('scripts.data_prep.validate_and_tokenize_data.download_file') +@patch('scripts.data_prep.validate_and_tokenize_data.json.load') +@patch('builtins.open', new_callable=mock_open, read_data='{"shards": [{"raw_data": {"basename": "shard1.mds"}}, {"raw_data": {"basename": "shard2.mds"}}]}') +def 
test_integrity_check(mock_file_open, mock_json_load, mock_download_file, mock_cloud_uploader): + # Setup mocks + mock_cloud_uploader.return_value = MockCloudUploader() + mock_json_load.return_value = {'shards': [{'raw_data': {'basename': 'shard1.mds'}}, {'raw_data': {'basename': 'shard2.mds'}}]} + + # Test case where integrity is valid + assert integrity_check('mock_dataset_path') + + # Test case where integrity is invalid + # Modify the mock to simulate a different scenario + mock_json_load.return_value = {'shards': [{'raw_data': {'basename': 'shard1.mds'}}]} # less shards than expected + assert not integrity_check('mock_dataset_path') + +# Additional tests can be written for cases like remote URL, file not found, etc. + + + +@patch('scripts.data_prep.validate_and_tokenize_data.dataset_info') +@patch('scripts.data_prep.validate_and_tokenize_data.get_dataset_split_names') +def test_check_HF_datasets(mock_get_splits, mock_dataset_info): + # Setup mocks + mock_get_splits.return_value = ['train', 'test'] + mock_dataset_info.return_value = MockDatasetInfo() + + # Test valid dataset with valid split + result, message = check_HF_datasets(['valid_dataset/train']) + assert result + + # Test valid dataset with invalid split + result, message = check_HF_datasets(['valid_dataset/invalid_split']) + assert not result + + # Test invalid dataset + mock_dataset_info.side_effect = Exception("Dataset not found") + result, message = check_HF_datasets(['invalid_dataset/train']) + assert not result + +# Additional tests for private datasets, token issues, etc. + + + +def test_is_hf_dataset_path(): + # Valid dataset paths + assert is_hf_dataset_path('user/dataset/train') + assert is_hf_dataset_path('user/dataset') + + # Invalid dataset paths + assert not is_hf_dataset_path('user@dataset/train') + assert not is_hf_dataset_path('just_dataset_name') + assert not is_hf_dataset_path('user/dataset/unknown_split/') + + +@patch('transformers.AutoTokenizer.from_pretrained') +def test_create_om_cfg_instruction_finetune(mock_from_pretrained): + mock_from_pretrained.return_value = MagicMock(spec=AutoTokenizer) + args = Namespace( + task_type='INSTRUCTION_FINETUNE', + train_data_path='hf_dataset/train', + model='model_name', + context_length=512 + ) + cfg, tokenizer = create_om_cfg(args) + assert cfg.dataset.hf_name == 'hf_dataset/train' + assert cfg.dataset.max_seq_len == 512 + +@patch('transformers.AutoTokenizer.from_pretrained') +def test_create_om_cfg_continued_pretrain(mock_from_pretrained): + mock_from_pretrained.return_value = MagicMock(spec=AutoTokenizer) + args = Namespace( + task_type='CONTINUED_PRETRAIN', + train_data_path='object_store_path', + model='model_name', + context_length=512 + ) + cfg, tokenizer = create_om_cfg(args) + assert cfg.dataset.remote == 'object_store_path' + assert cfg.dataset.max_seq_len == 512 + From 3ffd200391e1d357b4c7036d21e04d0e59809f71 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Sat, 6 Jan 2024 14:03:11 -0800 Subject: [PATCH 18/63] Add a printout for CPT --- scripts/data_prep/validate_and_tokenize_data.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/data_prep/validate_and_tokenize_data.py b/scripts/data_prep/validate_and_tokenize_data.py index ae2d1129b3..d647060324 100644 --- a/scripts/data_prep/validate_and_tokenize_data.py +++ b/scripts/data_prep/validate_and_tokenize_data.py @@ -668,6 +668,8 @@ def main(): if integrity_check(args.output_folder): raise RuntimeError(f"{args.output_folder} has mismatched number of shard files between merged index.json and actual 
shards!") + print("Converted data for continnued pre-training was saved in: ", args.output_folder) + else: raise ValueError(f"task_type can only be INSTRUCTION_FINETUNE or Continued_Pretraining but got {FT_API_args.task_type} instead!") # Run a few checks on resulted MDS datasets From 9362886aceead3e48e65cbad1b45a24ba5ea6b08 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Sat, 6 Jan 2024 14:20:00 -0800 Subject: [PATCH 19/63] update question --- scripts/data_prep/validate_and_tokenize_data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/data_prep/validate_and_tokenize_data.py b/scripts/data_prep/validate_and_tokenize_data.py index d647060324..30ba2e4456 100644 --- a/scripts/data_prep/validate_and_tokenize_data.py +++ b/scripts/data_prep/validate_and_tokenize_data.py @@ -56,6 +56,7 @@ # MAGIC timeout: Optional[float] = 10, # MAGIC future: Literal[False] = False, # MAGIC } +# MAGIC - What null checkings do we want to have? # MAGIC ``` # COMMAND ---------- From 898e5acfda2a85524046ab0b3a63345573ef022a Mon Sep 17 00:00:00 2001 From: xiaohanzhan-db Date: Mon, 8 Jan 2024 05:49:17 +0000 Subject: [PATCH 20/63] Add questions --- scripts/data_prep/validate_and_tokenize_data.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/data_prep/validate_and_tokenize_data.py b/scripts/data_prep/validate_and_tokenize_data.py index 30ba2e4456..dfa47b946b 100644 --- a/scripts/data_prep/validate_and_tokenize_data.py +++ b/scripts/data_prep/validate_and_tokenize_data.py @@ -57,6 +57,8 @@ # MAGIC future: Literal[False] = False, # MAGIC } # MAGIC - What null checkings do we want to have? +# MAGIC - How to map the model to its expected eos_text / bos_text format? [Ref](https://databricks.slack.com/archives/C05K29T9NBF/p1703644153357929?thread_ts=1703643155.904289&cid=C05K29T9NBF) +# MAGIC - How to automate tokenization for CPT? it is always really standard: sequence -> concat(tok(BOS), tok(sequence), tok(EOS)), and then concatenate sequences. 
[Ref](https://databricks.slack.com/archives/C05K29T9NBF/p1703698056000399?thread_ts=1703643155.904289&cid=C05K29T9NBF) # MAGIC ``` # COMMAND ---------- From a4bef7115327d66b4a529db96e0fced078997c60 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Sun, 7 Jan 2024 23:48:20 -0800 Subject: [PATCH 21/63] Fix lints --- .../data_prep/validate_and_tokenize_data.py | 169 ++++++++++-------- .../test_validate_and_tokenize_data.py | 98 ++++++---- 2 files changed, 160 insertions(+), 107 deletions(-) diff --git a/scripts/data_prep/validate_and_tokenize_data.py b/scripts/data_prep/validate_and_tokenize_data.py index dfa47b946b..de369af59d 100644 --- a/scripts/data_prep/validate_and_tokenize_data.py +++ b/scripts/data_prep/validate_and_tokenize_data.py @@ -1,3 +1,6 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + # Databricks notebook source # MAGIC %md # MAGIC JIRA: https://databricks.atlassian.net/jira/software/c/projects/STR/issues/STR-141?filter=allissues @@ -73,17 +76,16 @@ import os import re -from enum import Enum -from composer.utils import (ObjectStore, maybe_create_object_store_from_uri, parse_uri) -from torch.utils.data import DataLoader -from streaming import StreamingDataset -import numpy as np +from argparse import ArgumentParser, Namespace +from typing import Tuple, Union + +from composer.utils import (ObjectStore, maybe_create_object_store_from_uri, + parse_uri) +from datasets import get_dataset_split_names +from huggingface_hub import dataset_info from omegaconf import OmegaConf as om -from argparse import Namespace -from typing import Union, Tuple + from llmfoundry.utils import build_tokenizer -from huggingface_hub import dataset_info -from datasets import get_dataset_split_names # COMMAND ---------- @@ -93,23 +95,28 @@ # COMMAND ---------- FT_API_args = Namespace( - model = 'EleutherAI/gpt-neox-20b', - train_data_path = 'tatsu-lab/alpaca', # 'mosaicml/dolly_hhrlhf/train', # tatsu-lab/alpaca/train', - save_folder = 'dbfs:/databricks/mlflow-tracking/EXPERIMENT_ID/RUN_ID/artifacts/checkpoints', - task_type = "INSTRUCTION_FINETUNE", - eval_data_path = None, - eval_prompts = None, - custom_weights_path = None, - training_duration = None, - learning_rate = None, - context_length = 2048, - experiment_trackers = None, - disable_credentials_check = None, + model='EleutherAI/gpt-neox-20b', + train_data_path= + 'tatsu-lab/alpaca', # 'mosaicml/dolly_hhrlhf/train', # tatsu-lab/alpaca/train', + save_folder= + 'dbfs:/databricks/mlflow-tracking/EXPERIMENT_ID/RUN_ID/artifacts/checkpoints', + task_type='INSTRUCTION_FINETUNE', + eval_data_path=None, + eval_prompts=None, + custom_weights_path=None, + training_duration=None, + learning_rate=None, + context_length=2048, + experiment_trackers=None, + disable_credentials_check=None, # Extra argument to add to FT API # See comment https://databricks.atlassian.net/browse/STR-141?focusedCommentId=4308948 - data_prep_config = {'data_validation': True, 'data_prep': False}, - timeout = 10, - future = False, + data_prep_config={ + 'data_validation': True, + 'data_prep': False + }, + timeout=10, + future=False, ) os.environ['HF_ASSETS_CACHE'] = '/tmp/' @@ -131,14 +138,12 @@ import logging import math -import os import tempfile -from argparse import ArgumentParser, Namespace +from argparse import Namespace from concurrent.futures import ProcessPoolExecutor from glob import glob from typing import Iterable, List, Tuple, cast -import psutil from composer.utils import (ObjectStore, maybe_create_object_store_from_uri, parse_uri) 
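A minimal usage sketch for the keyword-style parse_args helper defined in this script, mirroring what main() does with FT_API_args; the argument mapping below is illustrative only and is not part of the patch:

    # Build the Namespace consumed by convert_text_to_mds from the FT API arguments.
    args = parse_args(tokenizer=FT_API_args.model,
                      concat_tokens=FT_API_args.context_length,
                      output_folder=FT_API_args.save_folder,
                      input_folder=FT_API_args.train_data_path)
    # Tokenize the raw text files and write MDS shards to the save folder.
    convert_text_to_mds(tokenizer_name=args.tokenizer,
                        output_folder=args.output_folder,
                        input_folder=args.input_folder,
                        concat_tokens=args.concat_tokens,
                        eos_text=args.eos_text,
                        bos_text=args.bos_text,
                        no_wrap=args.no_wrap,
                        compression=args.compression,
                        processes=args.processes,
                        reprocess=args.reprocess,
                        args_str=_args_str(args))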
from streaming import MDSWriter @@ -153,27 +158,33 @@ DONE_FILENAME = '.text_to_mds_conversion_done' -def parse_args( tokenizer, - concat_tokens, - output_folder, - input_folder, - compression = 'zstd', - bos_text = '', - eos_text = '', - no_wrap = False , - processes = 32, # min(max(psutil.cpu_count() - 2, 1), 32), - reprocess = False ) -> Namespace: - - parsed = Namespace(tokenizer = tokenizer, - concat_tokens = model_max_length, - output_folder = output_folder, - input_folder = input_folder, - eos_text = eos_text, - bos_text = bos_text, - no_wrap = no_wrap, - compression = compression, - processes = processes, - reprocess = reprocess) +def parse_args( + tokenizer: str, + concat_tokens: int, + output_folder: str, + input_folder: str, + compression: str = 'zstd', + bos_text: str = '', + eos_text: str = '', + no_wrap: bool = False, + processes: int = 32, # min(max(psutil.cpu_count() - 2, 1), 32), + reprocess: bool = False +) -> Namespace: + + parser = ArgumentParser( + description= + 'Convert text files into MDS format, optionally concatenating and tokenizing', + ) + parsed = Namespace(tokenizer=tokenizer, + concat_tokens=concat_tokens, + output_folder=output_folder, + input_folder=input_folder, + eos_text=eos_text, + bos_text=bos_text, + no_wrap=no_wrap, + compression=compression, + processes=processes, + reprocess=reprocess) # Make sure we have needed concat options if (parsed.concat_tokens is not None and @@ -505,10 +516,12 @@ def _args_str(original_args: Namespace) -> str: # COMMAND ---------- -from streaming.base.storage.upload import CloudUploader -from streaming.base.storage.download import download_file import json +from streaming.base.storage.download import download_file +from streaming.base.storage.upload import CloudUploader + + def integrity_check(out: Union[str, Tuple[str, str]]): """Check if the index file has integrity. @@ -540,11 +553,13 @@ def count_shards(mds_root: str): actual_n_shard_files = count_shards(cu.local) merged_index = json.load(open(local_merged_index_path, 'r')) - n_shard_files = len({b['raw_data']['basename'] for b in merged_index['shards']}) + n_shard_files = len( + {b['raw_data']['basename'] for b in merged_index['shards']}) return n_shard_files == actual_n_shard_files -def check_HF_datasets(dataset_names_with_splits): - token = os.environ.get("HUGGING_FACE_HUB_TOKEN") + +def check_HF_datasets(dataset_names_with_splits: list): + token = os.environ.get('HUGGING_FACE_HUB_TOKEN') for dataset_name_with_split in dataset_names_with_splits: dataset_name, split = os.path.split(dataset_name_with_split) # make sure we have a dataset and split @@ -552,22 +567,23 @@ def check_HF_datasets(dataset_names_with_splits): return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Please ensure that you include the split name (e.g. 'mosaicml/dolly_hhrlhf/train')." # check user access to the dataset try: - info = dataset_info(dataset_name) + _ = dataset_info(dataset_name) except: - token_warning = "" + token_warning = '' if not token: - token_warning = " If this is a private dataset, please set your HUGGING_FACE_HUB_TOKEN using: mcli create secret hf." + token_warning = ' If this is a private dataset, please set your HUGGING_FACE_HUB_TOKEN using: mcli create secret hf.' return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Please ensure that the dataset exists and that you have access to it. Remember to include the split name (e.g. 'mosaicml/dolly_hhrlhf/train')." 
+ token_warning # check that split exists try: splits = get_dataset_split_names(dataset_name) except: # error raised in the case of multiple subsets - return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Please make sure that the split is valid and that your dataset does not have subsets." + return False, f'Failed to load Hugging Face dataset {dataset_name_with_split}. Please make sure that the split is valid and that your dataset does not have subsets.' if split not in splits: - return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Split not found." - return True, "" + return False, f'Failed to load Hugging Face dataset {dataset_name_with_split}. Split not found.' + return True, '' -def is_hf_dataset_path(path): + +def is_hf_dataset_path(path: str): """Check if a given string is a dataset path used by Hugging Face. Args: @@ -577,11 +593,12 @@ def is_hf_dataset_path(path): bool: True if the string is a dataset path, False otherwise. """ # Regular expression to match the dataset path pattern - pattern = r"^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+/?(train|validation|test)?/?$" + pattern = r'^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+/?(train|validation|test)?/?$' return bool(re.match(pattern, path)) -def create_om_cfg(FT_API_args): + +def create_om_cfg(FT_API_args: Namespace): task_type = FT_API_args.task_type train_data_path = FT_API_args.train_data_path model = FT_API_args.model @@ -631,14 +648,18 @@ def create_om_cfg(FT_API_args): return cfg, tokenizer + # COMMAND ---------- + # build cfg from the inputs def main(): if FT_API_args.task_type == 'INSTRUCTION_FINETUNE': # check if train_data_path is a valid HF dataset url with splits. if not is_hf_dataset_path(FT_API_args.train_data_path): - raise ValueError(f"Input path {FT_API_args.train_data_path} is not supported. It needs to be a valid Huggingface dataset path.") + raise ValueError( + f'Input path {FT_API_args.train_data_path} is not supported. It needs to be a valid Huggingface dataset path.' + ) # load dataset.info and see if HF tokens are correctly set. check_HF_datasets(FT_API_args.train_data_path) @@ -669,16 +690,20 @@ def main(): # Check if the MDS dataset is integral by checking index.json if integrity_check(args.output_folder): - raise RuntimeError(f"{args.output_folder} has mismatched number of shard files between merged index.json and actual shards!") + raise RuntimeError( + f'{args.output_folder} has mismatched number of shard files between merged index.json and actual shards!' + ) - print("Converted data for continnued pre-training was saved in: ", args.output_folder) + print('Converted data for continnued pre-training was saved in: ', + args.output_folder) else: - raise ValueError(f"task_type can only be INSTRUCTION_FINETUNE or Continued_Pretraining but got {FT_API_args.task_type} instead!") - # Run a few checks on resulted MDS datasets - # 1. no shards in output_folder - # 2. check shard completeness by downloading and inspecting index.json - + raise ValueError( + f'task_type can only be INSTRUCTION_FINETUNE or Continued_Pretraining but got {FT_API_args.task_type} instead!' + ) + # Run a few checks on resulted MDS datasets + # 1. no shards in output_folder + # 2. 
check shard completeness by downloading and inspecting index.json from llmfoundry.data.finetuning import build_finetuning_dataloader tokenizer_name = 'EleutherAI/gpt-neox-20b' @@ -694,10 +719,10 @@ def main(): for batch in dataloader: total_tokens += token_counting_func(batch) - print("Total number of tokens:", total_tokens) + print('Total number of tokens:', total_tokens) -# COMMAND ---------- +# COMMAND ---------- if __name__ == '__main__': main() diff --git a/tests/a_scripts/data_prep/test_validate_and_tokenize_data.py b/tests/a_scripts/data_prep/test_validate_and_tokenize_data.py index 5b3b5b561b..8a78581fef 100644 --- a/tests/a_scripts/data_prep/test_validate_and_tokenize_data.py +++ b/tests/a_scripts/data_prep/test_validate_and_tokenize_data.py @@ -1,67 +1,99 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -from unittest.mock import Mock, patch, MagicMock, mock_open from argparse import Namespace -from scripts.data_prep.validate_and_tokenize_data import integrity_check, check_HF_datasets, is_hf_dataset_path, create_om_cfg -from streaming.base.storage.upload import CloudUploader +from typing import Any +from unittest.mock import MagicMock, mock_open, patch + from transformers import AutoTokenizer +from scripts.data_prep.validate_and_tokenize_data import (check_HF_datasets, + create_om_cfg, + integrity_check, + is_hf_dataset_path) + + class MockCloudUploader: + def __init__(self): - self.remote = "some_remote_path" - self.local = "some_local_path" + self.remote = 'some_remote_path' + self.local = 'some_local_path' def list_objects(self): return ['shard1.mds', 'shard2.mds'] + class MockDatasetInfo: + def __init__(self): - self.id = "valid_dataset" - self.description = "A mock dataset description" + self.id = 'valid_dataset' + self.description = 'A mock dataset description' + @patch('scripts.data_prep.validate_and_tokenize_data.CloudUploader.get') @patch('scripts.data_prep.validate_and_tokenize_data.download_file') @patch('scripts.data_prep.validate_and_tokenize_data.json.load') -@patch('builtins.open', new_callable=mock_open, read_data='{"shards": [{"raw_data": {"basename": "shard1.mds"}}, {"raw_data": {"basename": "shard2.mds"}}]}') -def test_integrity_check(mock_file_open, mock_json_load, mock_download_file, mock_cloud_uploader): +@patch( + 'builtins.open', + new_callable=mock_open, + read_data= + '{"shards": [{"raw_data": {"basename": "shard1.mds"}}, {"raw_data": {"basename": "shard2.mds"}}]}' +) +def test_integrity_check(mock_file_open: Any, mock_json_load: Any, + mock_download_file: Any, mock_cloud_uploader: Any): # Setup mocks mock_cloud_uploader.return_value = MockCloudUploader() - mock_json_load.return_value = {'shards': [{'raw_data': {'basename': 'shard1.mds'}}, {'raw_data': {'basename': 'shard2.mds'}}]} + mock_json_load.return_value = { + 'shards': [{ + 'raw_data': { + 'basename': 'shard1.mds' + } + }, { + 'raw_data': { + 'basename': 'shard2.mds' + } + }] + } # Test case where integrity is valid assert integrity_check('mock_dataset_path') # Test case where integrity is invalid # Modify the mock to simulate a different scenario - mock_json_load.return_value = {'shards': [{'raw_data': {'basename': 'shard1.mds'}}]} # less shards than expected + mock_json_load.return_value = { + 'shards': [{ + 'raw_data': { + 'basename': 'shard1.mds' + } + }] + } # less shards than expected assert not integrity_check('mock_dataset_path') -# Additional tests can be written for cases like remote URL, file not found, etc. 
+# Additional tests can be written for cases like remote URL, file not found, etc. @patch('scripts.data_prep.validate_and_tokenize_data.dataset_info') @patch('scripts.data_prep.validate_and_tokenize_data.get_dataset_split_names') -def test_check_HF_datasets(mock_get_splits, mock_dataset_info): +def test_check_HF_datasets(mock_get_splits: Any, mock_dataset_info: Any): # Setup mocks mock_get_splits.return_value = ['train', 'test'] mock_dataset_info.return_value = MockDatasetInfo() # Test valid dataset with valid split - result, message = check_HF_datasets(['valid_dataset/train']) + result, _ = check_HF_datasets(['valid_dataset/train']) assert result # Test valid dataset with invalid split - result, message = check_HF_datasets(['valid_dataset/invalid_split']) + result, _ = check_HF_datasets(['valid_dataset/invalid_split']) assert not result # Test invalid dataset - mock_dataset_info.side_effect = Exception("Dataset not found") - result, message = check_HF_datasets(['invalid_dataset/train']) + mock_dataset_info.side_effect = Exception('Dataset not found') + result, _ = check_HF_datasets(['invalid_dataset/train']) assert not result -# Additional tests for private datasets, token issues, etc. +# Additional tests for private datasets, token issues, etc. def test_is_hf_dataset_path(): @@ -76,28 +108,24 @@ def test_is_hf_dataset_path(): @patch('transformers.AutoTokenizer.from_pretrained') -def test_create_om_cfg_instruction_finetune(mock_from_pretrained): +def test_create_om_cfg_instruction_finetune(mock_from_pretrained: Any): mock_from_pretrained.return_value = MagicMock(spec=AutoTokenizer) - args = Namespace( - task_type='INSTRUCTION_FINETUNE', - train_data_path='hf_dataset/train', - model='model_name', - context_length=512 - ) - cfg, tokenizer = create_om_cfg(args) + args = Namespace(task_type='INSTRUCTION_FINETUNE', + train_data_path='hf_dataset/train', + model='model_name', + context_length=512) + cfg, _ = create_om_cfg(args) assert cfg.dataset.hf_name == 'hf_dataset/train' assert cfg.dataset.max_seq_len == 512 + @patch('transformers.AutoTokenizer.from_pretrained') -def test_create_om_cfg_continued_pretrain(mock_from_pretrained): +def test_create_om_cfg_continued_pretrain(mock_from_pretrained: Any): mock_from_pretrained.return_value = MagicMock(spec=AutoTokenizer) - args = Namespace( - task_type='CONTINUED_PRETRAIN', - train_data_path='object_store_path', - model='model_name', - context_length=512 - ) - cfg, tokenizer = create_om_cfg(args) + args = Namespace(task_type='CONTINUED_PRETRAIN', + train_data_path='object_store_path', + model='model_name', + context_length=512) + cfg, _ = create_om_cfg(args) assert cfg.dataset.remote == 'object_store_path' assert cfg.dataset.max_seq_len == 512 - From 4ca9cc6cd5c6a91f2d035808e92446ff835535db Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Mon, 8 Jan 2024 11:21:49 -0800 Subject: [PATCH 22/63] update format --- scripts/data_prep/validate_and_tokenize_data.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/data_prep/validate_and_tokenize_data.py b/scripts/data_prep/validate_and_tokenize_data.py index de369af59d..9772f7662c 100644 --- a/scripts/data_prep/validate_and_tokenize_data.py +++ b/scripts/data_prep/validate_and_tokenize_data.py @@ -1,7 +1,8 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -# Databricks notebook source +# COMMAND ---------- + # MAGIC %md # MAGIC JIRA: https://databricks.atlassian.net/jira/software/c/projects/STR/issues/STR-141?filter=allissues From 
d636a0f82661f79c9aba01ecd9369c5a9ac6c69c Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Mon, 8 Jan 2024 11:27:25 -0800 Subject: [PATCH 23/63] update --- scripts/data_prep/validate_and_tokenize_data.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/data_prep/validate_and_tokenize_data.py b/scripts/data_prep/validate_and_tokenize_data.py index 9772f7662c..5f222b5466 100644 --- a/scripts/data_prep/validate_and_tokenize_data.py +++ b/scripts/data_prep/validate_and_tokenize_data.py @@ -1,5 +1,7 @@ -# Copyright 2022 MosaicML LLM Foundry authors -# SPDX-License-Identifier: Apache-2.0 +# Databricks notebook source +# MAGIC %md +# MAGIC Copyright 2022 MosaicML LLM Foundry authors +# MAGIC SPDX-License-Identifier: Apache-2.0 # COMMAND ---------- From 827d1551d656731a686d26764161413cd9a7ff51 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Mon, 8 Jan 2024 11:29:38 -0800 Subject: [PATCH 24/63] nb source --- scripts/data_prep/validate_and_tokenize_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/data_prep/validate_and_tokenize_data.py b/scripts/data_prep/validate_and_tokenize_data.py index 5f222b5466..3b6c109199 100644 --- a/scripts/data_prep/validate_and_tokenize_data.py +++ b/scripts/data_prep/validate_and_tokenize_data.py @@ -1,6 +1,6 @@ # Databricks notebook source # MAGIC %md -# MAGIC Copyright 2022 MosaicML LLM Foundry authors +# MAGIC Copyright 2022 MosaicML LLM Foundry authors. # MAGIC SPDX-License-Identifier: Apache-2.0 # COMMAND ---------- From 6bbf3fced4f7ecda3c69e107f35147dde1040b18 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Mon, 8 Jan 2024 13:24:27 -0800 Subject: [PATCH 25/63] Remove license insert for validation notebook --- .pre-commit-config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d4c8cc699c..a7a3f62275 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -66,6 +66,7 @@ repos: - --comment-style - '#' types: [python] + exclude: scripts/data_prep/validate_and_tokenize_data.py - repo: https://github.com/PyCQA/docformatter rev: v1.5.0 hooks: From 5966b68b089244031276cd966ba87bafc56c6a07 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Wed, 10 Jan 2024 16:32:15 -0800 Subject: [PATCH 26/63] Add validation utils --- llmfoundry/utils/validation_utils.py | 634 +++++++++++++++++++++++++++ 1 file changed, 634 insertions(+) create mode 100644 llmfoundry/utils/validation_utils.py diff --git a/llmfoundry/utils/validation_utils.py b/llmfoundry/utils/validation_utils.py new file mode 100644 index 0000000000..dc5fa66242 --- /dev/null +++ b/llmfoundry/utils/validation_utils.py @@ -0,0 +1,634 @@ +import os +import re +import json +import tempfile +import numpy as np +import pandas as pd +from collections import defaultdict +from omegaconf import OmegaConf as om +from argparse import ArgumentParser, Namespace +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union + +import datasets +from datasets import get_dataset_split_names +from huggingface_hub import dataset_info + +from composer.utils import (ObjectStore, maybe_create_object_store_from_uri, parse_uri) +from llmfoundry.utils import build_tokenizer +from llmfoundry.data import ConcatTokensDataset + +from streaming.base.storage.download import download_file +from streaming.base.storage.upload import CloudUploader +from streaming.base.converters import dataframe_to_mds + + +def create_om_cfg(FT_API_args: Namespace): + task_type = FT_API_args.task_type + + train_data_path = 
FT_API_args.train_data_path + split = 'train' + + if is_hf_dataset_path(FT_API_args.train_data_path): + train_data_path, split = '/'.join(FT_API_args.train_data_path.split('/')[:2]), FT_API_args.train_data_path.split('/')[-1] + + model = FT_API_args.model + max_seq_len = FT_API_args.context_length + + common_args = { + 'drop_last': False, + 'num_workers': 2, + 'prefetch_factor': 2, + 'pin_memory': False, + 'persistent_workers': False, + 'timeout': 0 + } + if task_type == 'INSTRUCTION_FINETUNE': + cfg = om.create({ + 'dataset': { + 'hf_name': train_data_path, + 'split': split, + 'max_seq_len': max_seq_len, + 'decoder_only_format': True, + 'allow_pad_trimming': False, + 'shuffle': True, + }, + **common_args + }) + + else: + cfg = om.create({ + 'name': 'finetuning', + 'dataset': { + 'remote': train_data_path, + 'local': train_data_path, + 'split': split, + 'max_seq_len': max_seq_len, + 'decoder_only_format': True, + 'allow_pad_trimming': False, + 'packing_ratio': None, + 'shuffle': True, + }, + **common_args + }) + + tokenizer = build_tokenizer( + tokenizer_name=model, + tokenizer_kwargs={'model_max_length': max_seq_len}, + ) + + return cfg, tokenizer + +def token_counts_and_validation(FT_API_args): + from llmfoundry.data.finetuning import build_finetuning_dataloader + + cfg, tokenizer = create_om_cfg(FT_API_args) + + device_batch_size = 1 + dataspec = build_finetuning_dataloader(cfg, tokenizer, device_batch_size) + dataloader = dataspec.dataloader + token_counting_func = dataspec.get_num_tokens_in_batch + + total_tokens = [] + for batch in dataloader: + n_batch_tokens = token_counting_func(batch) + if n_batch_tokens == 0: + raise ValueError("Empty train sample") + total_tokens.append(n_batch_tokens) + return total_tokens + + +def check_HF_datasets(dataset_names_with_splits: list): + token = os.environ.get('HUGGING_FACE_HUB_TOKEN') + for dataset_name_with_split in dataset_names_with_splits: + dataset_name, split = os.path.split(dataset_name_with_split) + # make sure we have a dataset and split + if not dataset_name or not split: + return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Please ensure that you include the split name (e.g. 'mosaicml/dolly_hhrlhf/train')." + # check user access to the dataset + try: + _ = dataset_info(dataset_name) + except: + token_warning = '' + if not token: + token_warning = ' If this is a private dataset, please set your HUGGING_FACE_HUB_TOKEN using: mcli create secret hf.' + return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Please ensure that the dataset exists and that you have access to it. Remember to include the split name (e.g. 'mosaicml/dolly_hhrlhf/train')." + token_warning + # check that split exists + try: + splits = get_dataset_split_names(dataset_name) + except: # error raised in the case of multiple subsets + return False, f'Failed to load Hugging Face dataset {dataset_name_with_split}. Please make sure that the split is valid and that your dataset does not have subsets.' + if split not in splits: + return False, f'Failed to load Hugging Face dataset {dataset_name_with_split}. Split not found.' + return True, '' + + +def is_hf_dataset_path(path: str): + """Check if a given string is a dataset path used by Hugging Face. + + Args: + path (str): The string to be checked. + + Returns: + bool: True if the string is a dataset path, False otherwise. 
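+
+    Examples:
+        'mosaicml/dolly_hhrlhf/train' -> True
+        'mosaicml/dolly_hhrlhf' -> True
+        'just_a_dataset_name' -> False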
+ """ + # Regular expression to match the dataset path pattern + pattern = r'^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+/?(train|validation|test)?/?$' + + return bool(re.match(pattern, path)) + +def is_uc_delta_table(name: str): + """name is in the form of catalog.scheme.tablename + + Args: + name (str): a string folder/file/table path + Return: + (bool): True if name is valid UC delta table format + """ + return '.' in name and '/' not in name and '\\' not in name and len(name.split('.'))==3 + +def pandas_processing_fn(df: pd.DataFrame, + **args: Any) -> Iterable[Dict[str, bytes]]: + """Tokenize helper function for dataframe_to_mds. + + Args: + df (pandas.DataFrame): The input pandas DataFrame that needs to be processed. + **args : Additional arguments to be passed to the 'process_some_data' function during processing. + + Returns: + iterable obj + """ + hf_dataset = hf_datasets.Dataset.from_pandas(df=df) + tokenizer = AutoTokenizer.from_pretrained(args['tokenizer']) + tokenizer.model_max_length = 5000000000 # Hack to prevent warnings from HuggingFace + dataset = ConcatTokensDataset( + hf_dataset=hf_dataset, + max_length=args.get('concat_tokens', None), + tokenizer=tokenizer, + eos_text=args.get('eos_text', None), + bos_text=args.get('bos_text', None), + no_wrap=args.get('no_wrap', None), + ) + + for sample in dataset: # pyright: ignore + yield sample + +def integrity_check(out: Union[str, Tuple[str, str]]): + """Check if the index file has integrity. + + If index is a cloud url, first download it to a temp local file. + + Args: + out (Union[str, Tuple[str,str]]): MDS dataset path + """ + + def count_shards(mds_root: str): + n_shard_files = 0 + cu = CloudUploader.get(mds_root, exist_ok=True, keep_local=True) + for o in cu.list_objects(): + if o.endswith('.mds'): + n_shard_files += 1 + return n_shard_files + + cu = CloudUploader.get(out, keep_local=True, exist_ok=True) + + with tempfile.TemporaryDirectory() as temp_dir: + if cu.remote: + download_file(os.path.join(cu.remote, 'index.json'), + os.path.join(temp_dir, 'index.json'), + timeout=60) + actual_n_shard_files = count_shards(cu.remote) + local_merged_index_path = os.path.join(temp_dir, 'index.json') + else: + local_merged_index_path = os.path.join(cu.local, 'index.json') + actual_n_shard_files = count_shards(cu.local) + + merged_index = json.load(open(local_merged_index_path, 'r')) + n_shard_files = len( + {b['raw_data']['basename'] for b in merged_index['shards']}) + return n_shard_files == actual_n_shard_files + + + +import logging +import math +import os +import tempfile +from argparse import ArgumentParser, Namespace +from concurrent.futures import ProcessPoolExecutor +from glob import glob +from typing import Iterable, List, Tuple, cast + +import psutil +from composer.utils import (ObjectStore, maybe_create_object_store_from_uri, + parse_uri) +from streaming import MDSWriter +from tqdm import tqdm +from transformers import AutoTokenizer + +from llmfoundry.data import ConcatTokensDataset +from llmfoundry.utils.data_prep_utils import (DownloadingIterable, + merge_shard_groups) + +log = logging.getLogger(__name__) +DONE_FILENAME = '.text_to_mds_conversion_done' + + +def parse_args() -> Namespace: + """Parse commandline arguments.""" + parser = ArgumentParser( + description= + 'Convert text files into MDS format, optionally concatenating and tokenizing', + ) + parser.add_argument( + '--output_folder', + type=str, + required=True, + help='The folder to write output to', + ) + parser.add_argument( + '--input_folder', + type=str, + required=True, 
+ help='The folder with text files to convert to mds', + ) + parser.add_argument( + '--compression', + type=str, + default='zstd', + help='The compression algorithm to use for MDS writing', + ) + + parser.add_argument( + '--concat_tokens', + type=int, + help='Convert text to tokens and concatenate up to this many tokens', + ) + + parser.add_argument( + '--tokenizer', + type=str, + help='The name of the tokenizer to use', + ) + parser.add_argument( + '--bos_text', + type=str, + required=False, + default=None, + help= + 'The text to prepend to each example to separate concatenated examples', + ) + parser.add_argument( + '--eos_text', + type=str, + required=False, + default=None, + help= + 'The text to append to each example to separate concatenated examples', + ) + parser.add_argument( + '--no_wrap', + default=False, + action='store_true', + help= + 'Whether to let text examples wrap across multiple training examples', + ) + parser.add_argument( + '--processes', + type=int, + required=False, + default=min(max(psutil.cpu_count() - 2, 1), 32), + help= + 'The number of processes to use to download and convert the dataset', + ) + parser.add_argument( + '--reprocess', + type=bool, + required=False, + default=False, + help='If true, reprocess the input_folder to mds format. Otherwise, ' + + 'only reprocess upon changes to the input folder or dataset creation parameters.', + ) + + parsed = parser.parse_args() + + # Make sure we have needed concat options + if (parsed.concat_tokens is not None and + isinstance(parsed.concat_tokens, int) and parsed.tokenizer is None): + parser.error( + 'When setting --concat_tokens, you must specify a --tokenizer') + + # now that we have validated them, change BOS/EOS to strings + if parsed.bos_text is None: + parsed.bos_text = '' + if parsed.eos_text is None: + parsed.eos_text = '' + return parsed + + +def get_object_names(input_folder: str) -> List[str]: + """Get object names from a local or remote folder. + + Args: + input_folder (str): local or remote folder path. + """ + object_store = maybe_create_object_store_from_uri(input_folder) + if object_store is not None: + _, _, folder_prefix = parse_uri(input_folder) + names = [ + name for name in object_store.list_objects(folder_prefix) + if name.endswith('.txt') + ] + else: + # input_folder is a local folder + names = [ + text_file for dirpath, _, _ in os.walk(input_folder) + for text_file in glob(os.path.join(dirpath, '*.txt')) + ] + # return names, sizes + log.info(f'Found {len(names)} text files at {input_folder}') + + return names + + +def get_task_args( + object_names: List[str], + output_root: str, + input_folder: str, + n_groups: int, + tokenizer_name: str, + concat_tokens: int, + eos_text: str, + bos_text: str, + no_wrap: bool, + compression: str, +) -> Iterable: + """Get download_and_convert arguments split across n_groups. + + Each group handles a portion of object_names. 
+ + Args: + object_names (List[str]): Names of objects to process + output_root (str): Folder to write MDS shards to + input_folder (str): Folder of text files to process + n_groups (int): Number of groups to split the object names into + tokenizer_name (str): Name of tokenizer to use + concat_tokens (int): Concantenate up to this many tokens + eos_text (str): Textend to append to each example to separate concatenated samples + bos_text (str): Text to prepend to each example to separate concatenated samples + no_wrap: (bool): Whether to let text examples wrap across multiple training examples + compression (str): The compression algorithm to use for MDS writing + """ + num_objects = len(object_names) + objs_per_group = math.ceil(num_objects / n_groups) + for group, i in enumerate(range(0, num_objects, objs_per_group)): + output_subdir = os.path.join(output_root, str(group)) + yield ( + object_names[i:min(i + objs_per_group, num_objects)], + output_subdir, + input_folder, + tokenizer_name, + concat_tokens, + eos_text, + bos_text, + no_wrap, + compression, + ) + + +def download_and_convert_starargs(args: Tuple): + """Helper function to call download_and_convert with star args. + + This helps us use download_and_convert with mutiprocessing. + """ + return download_and_convert(*args) + + +def download_and_convert( + file_names: List[str], + output_folder: str, + input_folder: str, + tokenizer_name: str, + concat_tokens: int, + eos_text: str, + bos_text: str, + no_wrap: bool, + compression: str, +): + """Downloads and converts text fies to MDS format. + + Args: + file_names (List[str]): Files to process + output_folder (str): Folder to write MDS shards to + input_folder (str): Folder of text files to process + tokenizer_name (str): Name of tokenizer to use + concat_tokens (int): Concantenate up to this many tokens + eos_text (str): Textend to append to each example to separate concatenated samples + bos_text (str): Text to prepend to each example to separate concatenated samples + no_wrap: (bool): Whether to let text examples wrap across multiple training examples + compression (str): The compression algorithm to use for MDS writing + """ + object_store = maybe_create_object_store_from_uri(input_folder) + + # Download file_names + with tempfile.TemporaryDirectory() as tmp_dir: + downloading_iter = DownloadingIterable(object_names=file_names, + output_folder=tmp_dir, + object_store=object_store) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + tokenizer.model_max_length = 5000000000 # Hack to prevent warnings from HuggingFace + + # Use the ConcatTokensDataset from LLM-foundry to concatenate sequences of tokens up + # to the maximum sequence length + dataset = ConcatTokensDataset( + hf_dataset=downloading_iter, + max_length=concat_tokens, + tokenizer=tokenizer, + eos_text=eos_text, + bos_text=bos_text, + no_wrap=no_wrap, + ) + + columns = {'tokens': 'bytes'} + + log.info('Converting to MDS format...') + with MDSWriter(out=output_folder, + columns=columns, + compression=compression) as out: + for sample in tqdm(dataset): + out.write(sample) + + +def is_remote_path(path: str) -> bool: + """Checks whether a path is a remote path. + + Args: + path (str): path to check + """ + backend, _, _ = parse_uri(path) + return backend != '' + + +def is_already_processed(output_root: str, args_str: str, + object_names: List[str]) -> bool: + """Determines whether a group of text files has already been processed. + + Checks the done fie at output root to determine this. 
+ + Args: + output_root (str): Output folder where a done file may exist + args_str (str): String representation of the arguments + object_names (List[str]): Names of objects to convert to MDS format + """ + # Retrieve the done file contents + output_object_store = maybe_create_object_store_from_uri(output_root) + if output_object_store is not None: + # Download and read the done file from the remote object store + _, _, output_folder_prefix = parse_uri(output_root) + try: + with tempfile.TemporaryDirectory() as tmp_dir: + done_file = os.path.join(tmp_dir, DONE_FILENAME) + output_object_store.download_object( + os.path.join(output_folder_prefix, DONE_FILENAME), + done_file) + with open(done_file) as df: + done_file_contents = df.read().splitlines() + except FileNotFoundError: + return False + else: + # Read the local done file + done_file = os.path.join(output_root, DONE_FILENAME) + if not os.path.isfile(done_file): + return False + with open(done_file) as df: + done_file_contents = df.read().splitlines() + # Compare the arguments + prev_args_str = done_file_contents[0] + if prev_args_str != args_str: + return False + + # Compare file names + prev_names = done_file_contents[1:] + if len(prev_names) != len(object_names): + return False + for idx, prev_name in enumerate(prev_names): + if object_names[idx] != prev_name: + return False + return True + + +def write_done_file(folder: str, args_str: str, object_names: List[str]): + """Write a file to signify completion. + + This the done file includes the arguments to processing and + a list of objects that were processed. + + Args: + folder (str): Folder to write the done file to + args_str (str): String representation of arguments + object_names (List[str]): List of objects to convert to MDS format + """ + with open(os.path.join(folder, DONE_FILENAME), 'w') as done_file: + done_file.write('\n'.join([args_str] + object_names) + '\n') + + +def convert_text_to_mds( + tokenizer_name: str, + output_folder: str, + input_folder: str, + concat_tokens: int, + eos_text: str, + bos_text: str, + no_wrap: bool, + compression: str, + processes: int, + args_str: str, + reprocess: bool, +): + """Convert a folder of text files to MDS format. + + Args: + tokenizer_name (str): Name of tokenizer to use + output_folder (str): Folder to write MDS shards to + input_folder (str): Folder of text files to process + concat_tokens (int): Concantenate up to this many tokens + eos_text (str): Textend to append to each example to separate concatenated samples + bos_text (str): Text to prepend to each example to separate concatenated samples + no_wrap: (bool): Whether to let text examples wrap across multiple training examples + compression (str): The compression algorithm to use for MDS writing + processes (int): The number of processes to use. + args_str (str): String representation of the arguments + reprocess (bool): Whether to always reprocess the given folder of text files + """ + is_remote_output = is_remote_path(output_folder) + + object_names = get_object_names(input_folder) + if len(object_names) == 0: + raise ValueError(f'No text files were found at {input_folder}.') + + # Check if the text files in the bucket have already been processed. + if not reprocess and is_already_processed(output_folder, args_str, + object_names): + log.info( + f'Input folder {input_folder} is already processed at {output_folder} and ' + + + 'reprocess is set to False. Set reprocess to True if you would like to force reprocessing.' 
+ ) + return + + # Use a temporary local directory if the output is remote and there are more than 1 processes + local_output_folder = tempfile.TemporaryDirectory( + ).name if is_remote_output else output_folder + + if processes > 1: + # Download and convert the text files in parallel + args = get_task_args(object_names, local_output_folder, input_folder, + processes, tokenizer_name, concat_tokens, eos_text, + bos_text, no_wrap, compression) + with ProcessPoolExecutor(max_workers=processes) as executor: + list(executor.map(download_and_convert_starargs, args)) + + # Merge the mds shards from each of the processes into a single folder + merge_shard_groups(local_output_folder) + else: + download_and_convert(object_names, local_output_folder, input_folder, + tokenizer_name, concat_tokens, eos_text, bos_text, + no_wrap, compression) + + # Write a done file with the args and object names + write_done_file(local_output_folder, args_str, object_names) + + if is_remote_output: + # Upload the local output to the remote location + output_object_store = cast( + ObjectStore, maybe_create_object_store_from_uri(output_folder)) + _, _, output_folder_prefix = parse_uri(output_folder) + files_to_upload = os.listdir(local_output_folder) + + for file in files_to_upload: + assert not os.path.isdir(file) + remote_path = os.path.join(output_folder_prefix, file) + output_object_store.upload_object( + remote_path, os.path.join(local_output_folder, file)) + + +def _args_str(original_args: Namespace) -> str: + """Create a string from the args to determine whether to reprocess. + + Args: + original_args (Namespace): Arguments to main function. + """ + # Take the arguments that influence the final result. + # reprocess and max_mds_writer_workers are not taken. + args = Namespace( + tokenizer_name=original_args.tokenizer, + output_folder=original_args.output_folder, + input_folder=original_args.input_folder, + concat_tokens=original_args.concat_tokens, + eos_text=original_args.eos_text, + bos_text=original_args.bos_text, + no_wrap=original_args.no_wrap, + compression=original_args.compression, + processes=original_args.processes, + ) + + return str(args) From a7c36bccf474441d5ab845fe0d91eca336e65d7f Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Wed, 10 Jan 2024 23:47:00 -0500 Subject: [PATCH 27/63] Minor cleanups (#858) * nits * logger * add log * lint --- llmfoundry/models/mpt/modeling_mpt.py | 4 ++-- llmfoundry/utils/config_utils.py | 14 +++----------- scripts/train/train.py | 6 +++++- 3 files changed, 10 insertions(+), 14 deletions(-) diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py index e2274ffd6c..8b14c72f62 100644 --- a/llmfoundry/models/mpt/modeling_mpt.py +++ b/llmfoundry/models/mpt/modeling_mpt.py @@ -330,12 +330,12 @@ def __init__(self, config: MPTConfig): for module in self.modules(): if hasattr(module, 'bias') and isinstance( module.bias, nn.Parameter): - log.info(f'Removing bias ({module.bias}) from {module}.') + log.info(f'Removing bias from {module=}.') module.register_parameter('bias', None) # For transformer engine if hasattr(module, 'use_bias'): - log.info(f'Setting use_bias=False for {module}.') + log.info(f'Setting use_bias=False for {module=}.') module.use_bias = False log.debug(self) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 55576eaba0..29d78a0770 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -120,18 +120,10 @@ def process_init_device(model_cfg: DictConfig, 
fsdp_config: Optional[Dict]): # Set defaults for mixed initialization fsdp_config.setdefault('use_orig_params', False) fsdp_config.setdefault('load_monolith_rank0_only', True) - # Always set `sync_module_states` to True when using hybrid sharding - if fsdp_config is not None and \ - fsdp_config.get('sharding_strategy', 'FULL_SHARD') in ['HYBRID_SHARD', '_HYBRID_SHARD_ZERO2'] \ - and not fsdp_config.get('sync_module_states', False): - warnings.warn( - ('Setting `sync_module_states = True` for FSDP. This is required ' - 'when using hybrid sharding.')) - fsdp_config['sync_module_states'] = True - - # no mixed precision needed for weights when they're already 16 bits + + # No mixed precision needed for weights when they're already 16 bits master_dtype = model_cfg.get('master_weights_dtype') - small_dtypes = ('bf16', 'f16', 'float16', 'bfloat16', 'amp_fp16', + small_dtypes = ('bf16', 'fp16', 'float16', 'bfloat16', 'amp_fp16', 'amp_bf16') if fsdp_config and master_dtype in small_dtypes: reduce_dtype = None diff --git a/scripts/train/train.py b/scripts/train/train.py index 8c9fcc0291..c3da1f1d3c 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -438,13 +438,17 @@ def main(cfg: DictConfig) -> Trainer: format= f'%(asctime)s: rank{dist.get_global_rank()}[%(process)d][%(threadName)s]: %(levelname)s: %(name)s: %(message)s' ) - logging.getLogger('llmfoundry').setLevel(python_log_level.upper()) + logging.getLogger('llmfoundry').setLevel( + python_log_level.upper()) # Foundry module + logging.getLogger(__name__).setLevel( + python_log_level.upper()) # Train script # Initialize context init_context = process_init_device(model_config, fsdp_config) logged_cfg.update({'fsdp_config': fsdp_config}, merge=True) # Build tokenizer + log.info('Building tokenizer...') tokenizer_name = tokenizer_config['name'] tokenizer_kwargs = tokenizer_config.get('kwargs', {}) tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) From 55e46265d7c7c34552d469d15586df0f3bd53615 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Wed, 10 Jan 2024 23:25:49 -0800 Subject: [PATCH 28/63] update utils/__init__.py to include extra validation functions --- llmfoundry/utils/__init__.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/llmfoundry/utils/__init__.py b/llmfoundry/utils/__init__.py index 7abe4dcf75..fdf3d46e7e 100644 --- a/llmfoundry/utils/__init__.py +++ b/llmfoundry/utils/__init__.py @@ -13,6 +13,13 @@ update_batch_size_info) from llmfoundry.utils.model_download_utils import ( download_from_cache_server, download_from_hf_hub) + + from llmfoundry.utils.validation_utils import ( + create_om_cfg, token_counts_and_validation, + check_HF_datasets, is_hf_dataset_path, is_uc_delta_table, + pandas_processing_fn, integrity_check, convert_text_to_mds, + _args_str) + except ImportError as e: raise ImportError( 'Please make sure to pip install . to get requirements for llm-foundry.' 
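With the hunk above, the new validation helpers are imported into `llmfoundry.utils` (and the next hunk adds them to `__all__`), so downstream notebooks can pull them in directly. A minimal usage sketch, assuming `llm-foundry` is installed from a branch that actually ships `validation_utils` (as the notebook later in this patch does):

```python
# Check that a Hugging Face dataset reference (with an explicit split) is
# reachable before invoking the FT API. check_HF_datasets returns a
# (bool, message) tuple, per the implementation added in this patch.
from llmfoundry.utils import check_HF_datasets

ok, message = check_HF_datasets(['mosaicml/dolly_hhrlhf/train'])
if not ok:
    raise ValueError(message)
print('Hugging Face dataset check passed')
```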
@@ -34,4 +41,13 @@ 'update_batch_size_info', 'log_config', 'pop_config', + 'create_om_cfg', + 'token_counts_and_validation', + 'check_HF_datasets', + 'is_hf_dataset_path', + 'is_uc_delta_table', + 'pandas_processing_fn', + 'integrity_check', + 'convert_text_to_mds', + '_args_str', ] From 45544a1e83e35510248df9f6b6226f4f22fbdd23 Mon Sep 17 00:00:00 2001 From: xiaohanzhan-db Date: Thu, 11 Jan 2024 08:02:45 +0000 Subject: [PATCH 29/63] update notebook --- .../data_prep/validate_and_tokenize_data.py | 1105 +++++++---------- 1 file changed, 480 insertions(+), 625 deletions(-) diff --git a/scripts/data_prep/validate_and_tokenize_data.py b/scripts/data_prep/validate_and_tokenize_data.py index 3b6c109199..8f96561e84 100644 --- a/scripts/data_prep/validate_and_tokenize_data.py +++ b/scripts/data_prep/validate_and_tokenize_data.py @@ -11,37 +11,22 @@ # COMMAND ---------- # MAGIC %md -# MAGIC ## Warning: Important Alert Regarding the Script Usage +# MAGIC # Warning: Important Alert Regarding the Script Usage # MAGIC -# MAGIC ### Script Purpose: +# MAGIC #### Usage Scenario: +# MAGIC This script is particularly designed for Databricks' customers who have access to Databricks notebook and UC. Our customers may find this script useful in scenarios where there is a risk of data being malformed. It acts as a preventive measure to ensure data integrity and helps in cost assessment for the fine-tuning process. +# MAGIC +# MAGIC #### Script Purpose: # MAGIC - **Not for Training**: This script is not utilized during the training process. # MAGIC - **Ad-Hoc Validation**: It serves as an ad-hoc utility for users to run independently prior to starting fine-tuning. # MAGIC - **Data Verification**: Its primary function is to validate the user's data before they invoke the Fine-Tuning (FT) API. # MAGIC - **Cost Estimation**: Users can estimate the cost implications with this script. # MAGIC -# MAGIC ### Usage Scenario: -# MAGIC This script is particularly useful in scenarios where there is a risk of data being malformed. It acts as a preventive measure to ensure data integrity and helps in cost assessment for the fine-tuning process. -# MAGIC -# MAGIC ### Note on Long-Term Solution: +# MAGIC #### Note on Long-Term Solution: # MAGIC - **Temporary Measure**: This script is a stop-gap solution. # MAGIC - **Future Development**: We are in the process of developing a long-term data preparation service, which will eventually replace this script. # MAGIC -# MAGIC ### Checks Include: -# MAGIC - check input dataset: -# MAGIC 1) verify if dataset input format is valid (need to be one of these: Huggingface, delta table, dbfs:/Volumes, cloud path); -# MAGIC - check HF input location: -# MAGIC 1) load dataset info and check if it is accessible; -# MAGIC 2) verify if the split exists. -# MAGIC - check cloud path location: -# MAGIC 1) check the cloud prefix is compliant with composers' object store supports (gs, s3, oci) -# MAGIC 2) check if list objects returns nothing. -# MAGIC - count_tokens: -# MAGIC 1) For IFT task: validate tokenization by running tokenizer + filter on the entire dataset. count the number of tokens. Throws error if there are any empty responses or prompts -# MAGIC 2) For CPT task: call donwload_text_to_mds.py and count the resulted mds dataset. Note this could take a long time. -# MAGIC -# MAGIC ### Questions: -# MAGIC - Is "download_text_to_mds.py" always callable from the validation script? -# MAGIC - what is the function to reuse to run tokenization on HF datasets with filters? 
+# MAGIC #### User Defines:
# MAGIC - The inputs to this validation script is assumed to be the same or a subset of the FT API arguments, i.e., a configuration like below. Is this a valid assumption?
# MAGIC ```
# MAGIC cfg = {
@@ -62,66 +47,285 @@
# MAGIC timeout: Optional[float] = 10,
# MAGIC future: Literal[False] = False,
# MAGIC }
-# MAGIC - What null checkings do we want to have?
-# MAGIC - How to map the model to its expected eos_text / bos_text format? [Ref](https://databricks.slack.com/archives/C05K29T9NBF/p1703644153357929?thread_ts=1703643155.904289&cid=C05K29T9NBF)
-# MAGIC - How to automate tokenization for CPT? it is always really standard: sequence -> concat(tok(BOS), tok(sequence), tok(EOS)), and then concatenate sequences. [Ref](https://databricks.slack.com/archives/C05K29T9NBF/p1703698056000399?thread_ts=1703643155.904289&cid=C05K29T9NBF)
-# MAGIC ```
+# MAGIC ```
+# MAGIC
+# MAGIC #### Checks Include:
+# MAGIC - check input dataset:
+# MAGIC   1) verify that the dataset input format is valid (it needs to be one of: Hugging Face, delta table, dbfs:/Volumes, or cloud path);
+# MAGIC - check HF input location:
+# MAGIC   1) load dataset info and check if it is accessible;
+# MAGIC   2) verify if the split exists.
+# MAGIC - check cloud path location:
+# MAGIC   1) check that the cloud prefix is compliant with Composer's supported object stores (gs, s3, oci);
+# MAGIC   2) check whether listing objects returns nothing.
+# MAGIC - count_tokens:
+# MAGIC   1) For the IFT task: validate tokenization by running the tokenizer + filter on the entire dataset and count the number of tokens. Throws an error if there are any empty responses or prompts.
+# MAGIC   2) For the CPT task: call convert_text_to_mds.py and count the tokens in the resulting MDS dataset. Note this could take a long time.
+# MAGIC
+# MAGIC #### To-dos:
+# MAGIC - Map the model to its expected eos_text / bos_text format automatically [Ref](https://databricks.slack.com/archives/C05K29T9NBF/p1703644153357929?thread_ts=1703643155.904289&cid=C05K29T9NBF)
+# MAGIC - Automate tokenization for CPT. It is always standard: sequence -> concat(tok(BOS), tok(sequence), tok(EOS)), then concatenate sequences. [Ref](https://databricks.slack.com/archives/C05K29T9NBF/p1703698056000399?thread_ts=1703643155.904289&cid=C05K29T9NBF)
+# MAGIC - Add ``preprocessing_fn`` here. -- We don't need to: the FT API does not expose preprocessing_fn.
+# MAGIC - Add a sample_ratio parameter so users can run the validation on a portion of the whole dataset and then estimate by the scaling factor (see the sketch after this list).
+# MAGIC - Put the utility functions in a validation branch.
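The sample_ratio to-do could look roughly like the sketch below. Everything here is hypothetical: the current script counts tokens over the full dataloader, and neither a `sample_ratio` argument nor a per-example `count_tokens_fn` exists yet.

```python
# Hypothetical sketch: count tokens on a random fraction of the examples,
# then scale the estimate back up by the sampling factor.
import math
import random

def estimate_total_tokens(examples, count_tokens_fn, sample_ratio=0.1, seed=42):
    n = len(examples)
    k = max(1, math.ceil(n * sample_ratio))
    indices = random.Random(seed).sample(range(n), k)
    sampled_tokens = sum(count_tokens_fn(examples[i]) for i in indices)
    return int(sampled_tokens * n / k)
```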
+# MAGIC - + +# COMMAND ---------- + +# %pip install git+https://github.com/mosaicml/llm-foundry.git@byod/data_validation +%pip install git+https://github.com/XiaohanZhangCMU/llm-foundryX.git@validation + +# COMMAND ---------- + +dbutils.library.restartPython() # COMMAND ---------- -# MAGIC %pip install llm-foundry +# MAGIC %md +# MAGIC # Instruction Fine Tuning # COMMAND ---------- -# dbutils.library.restartPython() +# MAGIC %md +# MAGIC #### All Utility Functions # COMMAND ---------- import os import re +import json +import tempfile +import numpy as np +import pandas as pd +from collections import defaultdict +from omegaconf import OmegaConf as om from argparse import ArgumentParser, Namespace -from typing import Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union -from composer.utils import (ObjectStore, maybe_create_object_store_from_uri, - parse_uri) +import datasets from datasets import get_dataset_split_names from huggingface_hub import dataset_info -from omegaconf import OmegaConf as om +from composer.utils import (ObjectStore, maybe_create_object_store_from_uri, parse_uri) from llmfoundry.utils import build_tokenizer +from llmfoundry.utils import (create_om_cfg, token_counts_and_validation, + check_HF_datasets, is_hf_dataset_path, is_uc_delta_table, + pandas_processing_fn, integrity_check, convert_text_to_mds, + _args_str) +from llmfoundry.data import ConcatTokensDataset + +from streaming.base.storage.download import download_file +from streaming.base.storage.upload import CloudUploader +from streaming.base.converters import dataframe_to_mds + + +# def create_om_cfg(FT_API_args: Namespace): +# task_type = FT_API_args.task_type + +# train_data_path = FT_API_args.train_data_path +# split = 'train' + +# if is_hf_dataset_path(FT_API_args.train_data_path): +# train_data_path, split = '/'.join(FT_API_args.train_data_path.split('/')[:2]), FT_API_args.train_data_path.split('/')[-1] + +# model = FT_API_args.model +# max_seq_len = FT_API_args.context_length + +# common_args = { +# 'drop_last': False, +# 'num_workers': 2, +# 'prefetch_factor': 2, +# 'pin_memory': False, +# 'persistent_workers': False, +# 'timeout': 0 +# } +# if task_type == 'INSTRUCTION_FINETUNE': +# cfg = om.create({ +# 'dataset': { +# 'hf_name': train_data_path, +# 'split': split, +# 'max_seq_len': max_seq_len, +# 'decoder_only_format': True, +# 'allow_pad_trimming': False, +# 'shuffle': True, +# }, +# **common_args +# }) + +# else: +# cfg = om.create({ +# 'name': 'finetuning', +# 'dataset': { +# 'remote': train_data_path, +# 'local': train_data_path, +# 'split': split, +# 'max_seq_len': max_seq_len, +# 'decoder_only_format': True, +# 'allow_pad_trimming': False, +# 'packing_ratio': None, +# 'shuffle': True, +# }, +# **common_args +# }) + +# tokenizer = build_tokenizer( +# tokenizer_name=model, +# tokenizer_kwargs={'model_max_length': max_seq_len}, +# ) + +# return cfg, tokenizer + +# def token_counts_and_validation(FT_API_args): +# from llmfoundry.data.finetuning import build_finetuning_dataloader + +# cfg, tokenizer = create_om_cfg(FT_API_args) + +# device_batch_size = 1 +# dataspec = build_finetuning_dataloader(cfg, tokenizer, device_batch_size) +# dataloader = dataspec.dataloader +# token_counting_func = dataspec.get_num_tokens_in_batch + +# total_tokens = [] +# for batch in dataloader: +# n_batch_tokens = token_counting_func(batch) +# if n_batch_tokens == 0: +# raise ValueError("Empty train sample") +# total_tokens.append(n_batch_tokens) +# return total_tokens + +# 
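The commented-out blocks in this cell are kept only as a reference; the live implementations come from `llmfoundry.utils.validation_utils`, imported at the top of the cell. As a quick sanity check of the two input-path helpers, whose expected behaviour follows the regex and string checks shown in the reference code:

```python
# is_hf_dataset_path accepts 'org/dataset', optionally followed by a split;
# is_uc_delta_table accepts a three-part 'catalog.schema.table' name.
from llmfoundry.utils import is_hf_dataset_path, is_uc_delta_table

assert is_hf_dataset_path('mosaicml/dolly_hhrlhf/train')
assert is_hf_dataset_path('tatsu-lab/alpaca')
assert not is_hf_dataset_path('dbfs:/Volumes/main/my_volume/data')

assert is_uc_delta_table('main.streaming.random_large_table')
assert not is_uc_delta_table('main.streaming')            # needs all three parts
assert not is_uc_delta_table('dbfs:/Volumes/main/data')   # a path, not a table
```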
#---------------------------------------- IFT ---------------------------------------- # + +# def check_HF_datasets(dataset_names_with_splits: list): +# token = os.environ.get('HUGGING_FACE_HUB_TOKEN') +# for dataset_name_with_split in dataset_names_with_splits: +# dataset_name, split = os.path.split(dataset_name_with_split) +# # make sure we have a dataset and split +# if not dataset_name or not split: +# return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Please ensure that you include the split name (e.g. 'mosaicml/dolly_hhrlhf/train')." +# # check user access to the dataset +# try: +# _ = dataset_info(dataset_name) +# except: +# token_warning = '' +# if not token: +# token_warning = ' If this is a private dataset, please set your HUGGING_FACE_HUB_TOKEN using: mcli create secret hf.' +# return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Please ensure that the dataset exists and that you have access to it. Remember to include the split name (e.g. 'mosaicml/dolly_hhrlhf/train')." + token_warning +# # check that split exists +# try: +# splits = get_dataset_split_names(dataset_name) +# except: # error raised in the case of multiple subsets +# return False, f'Failed to load Hugging Face dataset {dataset_name_with_split}. Please make sure that the split is valid and that your dataset does not have subsets.' +# if split not in splits: +# return False, f'Failed to load Hugging Face dataset {dataset_name_with_split}. Split not found.' +# return True, '' + + +# def is_hf_dataset_path(path: str): +# """Check if a given string is a dataset path used by Hugging Face. + +# Args: +# path (str): The string to be checked. + +# Returns: +# bool: True if the string is a dataset path, False otherwise. +# """ +# # Regular expression to match the dataset path pattern +# pattern = r'^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+/?(train|validation|test)?/?$' + +# return bool(re.match(pattern, path)) + +# def is_uc_delta_table(name: str): +# """name is in the form of catalog.scheme.tablename + +# Args: +# name (str): a string folder/file/table path +# Return: +# (bool): True if name is valid UC delta table format +# """ +# return '.' in name and '/' not in name and '\\' not in name and len(name.split('.'))==3 + +# #---------------------------------------- CPT ---------------------------------------- # + +# def pandas_processing_fn(df: pd.DataFrame, +# **args: Any) -> Iterable[Dict[str, bytes]]: +# """Tokenize helper function for dataframe_to_mds. + +# Args: +# df (pandas.DataFrame): The input pandas DataFrame that needs to be processed. +# **args : Additional arguments to be passed to the 'process_some_data' function during processing. + +# Returns: +# iterable obj +# """ +# hf_dataset = hf_datasets.Dataset.from_pandas(df=df) +# tokenizer = AutoTokenizer.from_pretrained(args['tokenizer']) +# tokenizer.model_max_length = 5000000000 # Hack to prevent warnings from HuggingFace +# dataset = ConcatTokensDataset( +# hf_dataset=hf_dataset, +# max_length=args.get('concat_tokens', None), +# tokenizer=tokenizer, +# eos_text=args.get('eos_text', None), +# bos_text=args.get('bos_text', None), +# no_wrap=args.get('no_wrap', None), +# ) + +# for sample in dataset: # pyright: ignore +# yield sample + +# def integrity_check(out: Union[str, Tuple[str, str]]): +# """Check if the index file has integrity. + +# If index is a cloud url, first download it to a temp local file. 
+ +# Args: +# out (Union[str, Tuple[str,str]]): MDS dataset path +# """ + +# def count_shards(mds_root: str): +# n_shard_files = 0 +# cu = CloudUploader.get(mds_root, exist_ok=True, keep_local=True) +# for o in cu.list_objects(): +# if o.endswith('.mds'): +# n_shard_files += 1 +# return n_shard_files + +# cu = CloudUploader.get(out, keep_local=True, exist_ok=True) + +# with tempfile.TemporaryDirectory() as temp_dir: +# if cu.remote: +# download_file(os.path.join(cu.remote, 'index.json'), +# os.path.join(temp_dir, 'index.json'), +# timeout=60) +# actual_n_shard_files = count_shards(cu.remote) +# local_merged_index_path = os.path.join(temp_dir, 'index.json') +# else: +# local_merged_index_path = os.path.join(cu.local, 'index.json') +# actual_n_shard_files = count_shards(cu.local) + +# merged_index = json.load(open(local_merged_index_path, 'r')) +# n_shard_files = len( +# {b['raw_data']['basename'] for b in merged_index['shards']}) +# return n_shard_files == actual_n_shard_files + # COMMAND ---------- -# MAGIC %md -# MAGIC ## User Defines the Cell Below +# MAGIC %md +# MAGIC #### User Defines +# MAGIC Use the same input arguments you will want to provide to FT API # COMMAND ---------- FT_API_args = Namespace( model='EleutherAI/gpt-neox-20b', - train_data_path= - 'tatsu-lab/alpaca', # 'mosaicml/dolly_hhrlhf/train', # tatsu-lab/alpaca/train', - save_folder= - 'dbfs:/databricks/mlflow-tracking/EXPERIMENT_ID/RUN_ID/artifacts/checkpoints', + train_data_path= 'tatsu-lab/alpaca/train', # 'main.streaming.random_large_table', # # 'mosaicml/dolly_hhrlhf/train', # tatsu-lab/alpaca/train', + save_folder= 'dbfs:/databricks/mlflow-tracking/EXPERIMENT_ID/RUN_ID/artifacts/checkpoints', task_type='INSTRUCTION_FINETUNE', - eval_data_path=None, - eval_prompts=None, - custom_weights_path=None, - training_duration=None, - learning_rate=None, + training_duration=3, context_length=2048, - experiment_trackers=None, - disable_credentials_check=None, - # Extra argument to add to FT API - # See comment https://databricks.atlassian.net/browse/STR-141?focusedCommentId=4308948 - data_prep_config={ - 'data_validation': True, - 'data_prep': False - }, - timeout=10, - future=False, ) +temporary_jsonl_data_path = '/tmp/ft_data/train/' os.environ['HF_ASSETS_CACHE'] = '/tmp/' os.environ['HF_HOME'] = '/tmp/' os.environ['HF_HUB_CACHE'] = '/tmp/' @@ -130,602 +334,253 @@ # COMMAND ---------- # MAGIC %md -# MAGIC ## Adapted from llmfoundry/scripts/data_prep/convert_text_to_mds.py +# MAGIC #### Data Loading +# MAGIC +# MAGIC The IFT data needs to stay with a format +# MAGIC ``` +# MAGIC prompt: xxx +# MAGIC response or completion: yyy +# MAGIC ``` +# MAGIC +# MAGIC Based on FT_API_args.train_data_path, we will select an ingestion method from three options. +# MAGIC +# MAGIC - Option-1. Your data is a JSONL file which stores in an object store supported by Composer. [Example file to-be-added](todo - add a link to such a file) +# MAGIC - Option-2. You provide a Huggingface dataset ID. Note you need to provide a split as well. [Example dataset link to-be-added](huggingface.co) +# MAGIC - Option-3. You have a delta table. 
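For Option-1, each JSONL line is expected to carry a `prompt` and a `response` (or `completion`) field in the format shown above. A small illustrative sketch, where the file path and record contents are made up for illustration:

```python
# Write two records in the expected IFT format and load them back with
# Hugging Face `datasets`, mirroring what the ingestion cell below does.
import json
import datasets

example_path = '/tmp/example_ift_train.jsonl'  # hypothetical path
records = [
    {'prompt': 'What is MDS?', 'response': 'MDS is the sharded dataset format used by Streaming.'},
    {'prompt': 'What does this notebook do?', 'response': 'It validates data and estimates token counts before fine-tuning.'},
]
with open(example_path, 'w') as f:
    for record in records:
        f.write(json.dumps(record) + '\n')

ds = datasets.load_dataset('json', data_files=example_path, split='train')
print(ds[0]['prompt'], '->', ds[0]['response'])
```

Records missing either field are flagged later by the validation cell, which tallies `missing_prompt` / `missing_response` errors.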
# COMMAND ---------- -# Copyright 2022 MosaicML LLM Foundry authors -# SPDX-License-Identifier: Apache-2.0 +raw_dataset = None -# Taken from llmfoundry/scripts/data_prep/convert_text_to_mds.py +if FT_API_args.train_data_path.endswith('.jsonl') and os.path.exists(FT_API_args.train_data_path): + data_path = FT_API_args.train_data_path + raw_dataset = datasets.load_dataset('json', data_path) -import logging -import math -import tempfile -from argparse import Namespace -from concurrent.futures import ProcessPoolExecutor -from glob import glob -from typing import Iterable, List, Tuple, cast +if is_hf_dataset_path(FT_API_args.train_data_path): + check_HF_datasets(FT_API_args.train_data_path) + dataset_id, split = '/'.join(FT_API_args.train_data_path.split('/')[:2]), FT_API_args.train_data_path.split('/')[-1] + raw_dataset = datasets.load_dataset(dataset_id, split=split) -from composer.utils import (ObjectStore, maybe_create_object_store_from_uri, - parse_uri) -from streaming import MDSWriter -from tqdm import tqdm -from transformers import AutoTokenizer +if is_uc_delta_table(FT_API_args.train_data_path): + delta_table_name = FT_API_args.train_data_path + df = spark.read.table(delta_table_name) + df = df.toPandas() + df.rename(columns={'prompts': 'prompt', 'responses': 'response'}, inplace=True) + df.to_json(os.path.join(temporary_jsonl_data_path, 'ift.jsonl'), orient='records', lines=True) + raw_dataset = datasets.Dataset.from_pandas(df) + FT_API_args.train_data_path = temporary_jsonl_data_path -from llmfoundry.data import ConcatTokensDataset -from llmfoundry.utils.data_prep_utils import (DownloadingIterable, - merge_shard_groups) - -log = logging.getLogger(__name__) -DONE_FILENAME = '.text_to_mds_conversion_done' - - -def parse_args( - tokenizer: str, - concat_tokens: int, - output_folder: str, - input_folder: str, - compression: str = 'zstd', - bos_text: str = '', - eos_text: str = '', - no_wrap: bool = False, - processes: int = 32, # min(max(psutil.cpu_count() - 2, 1), 32), - reprocess: bool = False -) -> Namespace: - - parser = ArgumentParser( - description= - 'Convert text files into MDS format, optionally concatenating and tokenizing', - ) - parsed = Namespace(tokenizer=tokenizer, - concat_tokens=concat_tokens, - output_folder=output_folder, - input_folder=input_folder, - eos_text=eos_text, - bos_text=bos_text, - no_wrap=no_wrap, - compression=compression, - processes=processes, - reprocess=reprocess) - - # Make sure we have needed concat options - if (parsed.concat_tokens is not None and - isinstance(parsed.concat_tokens, int) and parsed.tokenizer is None): - parser.error( - 'When setting --concat_tokens, you must specify a --tokenizer') - - # now that we have validated them, change BOS/EOS to strings - if parsed.bos_text is None: - parsed.bos_text = '' - if parsed.eos_text is None: - parsed.eos_text = '' - return parsed - - -def get_object_names(input_folder: str) -> List[str]: - """Get object names from a local or remote folder. - - Args: - input_folder (str): local or remote folder path. 
- """ - object_store = maybe_create_object_store_from_uri(input_folder) - if object_store is not None: - _, _, folder_prefix = parse_uri(input_folder) - names = [ - name for name in object_store.list_objects(folder_prefix) - if name.endswith('.txt') - ] - else: - # input_folder is a local folder - names = [ - text_file for dirpath, _, _ in os.walk(input_folder) - for text_file in glob(os.path.join(dirpath, '*.txt')) - ] - # return names, sizes - log.info(f'Found {len(names)} text files at {input_folder}') - - return names - - -def get_task_args( - object_names: List[str], - output_root: str, - input_folder: str, - n_groups: int, - tokenizer_name: str, - concat_tokens: int, - eos_text: str, - bos_text: str, - no_wrap: bool, - compression: str, -) -> Iterable: - """Get download_and_convert arguments split across n_groups. - - Each group handles a portion of object_names. - - Args: - object_names (List[str]): Names of objects to process - output_root (str): Folder to write MDS shards to - input_folder (str): Folder of text files to process - n_groups (int): Number of groups to split the object names into - tokenizer_name (str): Name of tokenizer to use - concat_tokens (int): Concantenate up to this many tokens - eos_text (str): Textend to append to each example to separate concatenated samples - bos_text (str): Text to prepend to each example to separate concatenated samples - no_wrap: (bool): Whether to let text examples wrap across multiple training examples - compression (str): The compression algorithm to use for MDS writing - """ - num_objects = len(object_names) - objs_per_group = math.ceil(num_objects / n_groups) - for group, i in enumerate(range(0, num_objects, objs_per_group)): - output_subdir = os.path.join(output_root, str(group)) - yield ( - object_names[i:min(i + objs_per_group, num_objects)], - output_subdir, - input_folder, - tokenizer_name, - concat_tokens, - eos_text, - bos_text, - no_wrap, - compression, - ) - - -def download_and_convert_starargs(args: Tuple): - """Helper function to call download_and_convert with star args. - - This helps us use download_and_convert with mutiprocessing. - """ - return download_and_convert(*args) - - -def download_and_convert( - file_names: List[str], - output_folder: str, - input_folder: str, - tokenizer_name: str, - concat_tokens: int, - eos_text: str, - bos_text: str, - no_wrap: bool, - compression: str, -): - """Downloads and converts text fies to MDS format. 
- - Args: - file_names (List[str]): Files to process - output_folder (str): Folder to write MDS shards to - input_folder (str): Folder of text files to process - tokenizer_name (str): Name of tokenizer to use - concat_tokens (int): Concantenate up to this many tokens - eos_text (str): Textend to append to each example to separate concatenated samples - bos_text (str): Text to prepend to each example to separate concatenated samples - no_wrap: (bool): Whether to let text examples wrap across multiple training examples - compression (str): The compression algorithm to use for MDS writing - """ - object_store = maybe_create_object_store_from_uri(input_folder) - - # Download file_names - with tempfile.TemporaryDirectory() as tmp_dir: - downloading_iter = DownloadingIterable(object_names=file_names, - output_folder=tmp_dir, - object_store=object_store) - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) - tokenizer.model_max_length = 5000000000 # Hack to prevent warnings from HuggingFace - - # Use the ConcatTokensDataset from LLM-foundry to concatenate sequences of tokens up - # to the maximum sequence length - dataset = ConcatTokensDataset( - hf_dataset=downloading_iter, - max_length=concat_tokens, - tokenizer=tokenizer, - eos_text=eos_text, - bos_text=bos_text, - no_wrap=no_wrap, - ) - - columns = {'tokens': 'bytes'} - - log.info('Converting to MDS format...') - with MDSWriter(out=output_folder, - columns=columns, - compression=compression) as out: - for sample in tqdm(dataset): - out.write(sample) - - -def is_remote_path(path: str) -> bool: - """Checks whether a path is a remote path. - - Args: - path (str): path to check - """ - backend, _, _ = parse_uri(path) - return backend != '' - - -def is_already_processed(output_root: str, args_str: str, - object_names: List[str]) -> bool: - """Determines whether a group of text files has already been processed. - - Checks the done fie at output root to determine this. - - Args: - output_root (str): Output folder where a done file may exist - args_str (str): String representation of the arguments - object_names (List[str]): Names of objects to convert to MDS format - """ - # Retrieve the done file contents - output_object_store = maybe_create_object_store_from_uri(output_root) - if output_object_store is not None: - # Download and read the done file from the remote object store - _, _, output_folder_prefix = parse_uri(output_root) - try: - with tempfile.TemporaryDirectory() as tmp_dir: - done_file = os.path.join(tmp_dir, DONE_FILENAME) - output_object_store.download_object( - os.path.join(output_folder_prefix, DONE_FILENAME), - done_file) - with open(done_file) as df: - done_file_contents = df.read().splitlines() - except FileNotFoundError: - return False - else: - # Read the local done file - done_file = os.path.join(output_root, DONE_FILENAME) - if not os.path.isfile(done_file): - return False - with open(done_file) as df: - done_file_contents = df.read().splitlines() - # Compare the arguments - prev_args_str = done_file_contents[0] - if prev_args_str != args_str: - return False - - # Compare file names - prev_names = done_file_contents[1:] - if len(prev_names) != len(object_names): - return False - for idx, prev_name in enumerate(prev_names): - if object_names[idx] != prev_name: - return False - return True - - -def write_done_file(folder: str, args_str: str, object_names: List[str]): - """Write a file to signify completion. - - This the done file includes the arguments to processing and - a list of objects that were processed. 
- - Args: - folder (str): Folder to write the done file to - args_str (str): String representation of arguments - object_names (List[str]): List of objects to convert to MDS format - """ - with open(os.path.join(folder, DONE_FILENAME), 'w') as done_file: - done_file.write('\n'.join([args_str] + object_names) + '\n') - - -def convert_text_to_mds( - tokenizer_name: str, - output_folder: str, - input_folder: str, - concat_tokens: int, - eos_text: str, - bos_text: str, - no_wrap: bool, - compression: str, - processes: int, - args_str: str, - reprocess: bool, -): - """Convert a folder of text files to MDS format. - - Args: - tokenizer_name (str): Name of tokenizer to use - output_folder (str): Folder to write MDS shards to - input_folder (str): Folder of text files to process - concat_tokens (int): Concantenate up to this many tokens - eos_text (str): Textend to append to each example to separate concatenated samples - bos_text (str): Text to prepend to each example to separate concatenated samples - no_wrap: (bool): Whether to let text examples wrap across multiple training examples - compression (str): The compression algorithm to use for MDS writing - processes (int): The number of processes to use. - args_str (str): String representation of the arguments - reprocess (bool): Whether to always reprocess the given folder of text files - """ - is_remote_output = is_remote_path(output_folder) - - object_names = get_object_names(input_folder) - if len(object_names) == 0: - raise ValueError(f'No text files were found at {input_folder}.') - - # Check if the text files in the bucket have already been processed. - if not reprocess and is_already_processed(output_folder, args_str, - object_names): - log.info( - f'Input folder {input_folder} is already processed at {output_folder} and ' - + - 'reprocess is set to False. Set reprocess to True if you would like to force reprocessing.' - ) - return - - # Use a temporary local directory if the output is remote and there are more than 1 processes - local_output_folder = tempfile.TemporaryDirectory( - ).name if is_remote_output else output_folder - - if processes > 1: - # Download and convert the text files in parallel - args = get_task_args(object_names, local_output_folder, input_folder, - processes, tokenizer_name, concat_tokens, eos_text, - bos_text, no_wrap, compression) - with ProcessPoolExecutor(max_workers=processes) as executor: - list(executor.map(download_and_convert_starargs, args)) - - # Merge the mds shards from each of the processes into a single folder - merge_shard_groups(local_output_folder) - else: - download_and_convert(object_names, local_output_folder, input_folder, - tokenizer_name, concat_tokens, eos_text, bos_text, - no_wrap, compression) - - # Write a done file with the args and object names - write_done_file(local_output_folder, args_str, object_names) - - if is_remote_output: - # Upload the local output to the remote location - output_object_store = cast( - ObjectStore, maybe_create_object_store_from_uri(output_folder)) - _, _, output_folder_prefix = parse_uri(output_folder) - files_to_upload = os.listdir(local_output_folder) - - for file in files_to_upload: - assert not os.path.isdir(file) - remote_path = os.path.join(output_folder_prefix, file) - output_object_store.upload_object( - remote_path, os.path.join(local_output_folder, file)) - - -def _args_str(original_args: Namespace) -> str: - """Create a string from the args to determine whether to reprocess. - - Args: - original_args (Namespace): Arguments to main function. 
- """ - # Take the arguments that influence the final result. - # reprocess and max_mds_writer_workers are not taken. - args = Namespace( - tokenizer_name=original_args.tokenizer, - output_folder=original_args.output_folder, - input_folder=original_args.input_folder, - concat_tokens=original_args.concat_tokens, - eos_text=original_args.eos_text, - bos_text=original_args.bos_text, - no_wrap=original_args.no_wrap, - compression=original_args.compression, - processes=original_args.processes, - ) - - return str(args) +if raw_dataset is None: + raise RuntimeError("Can't find a proper ingestion method") + +# COMMAND ---------- +!mkdir -p {temporary_jsonl_data_path} # COMMAND ---------- # MAGIC %md -# MAGIC ## Validate Inputs and Count tokens +# MAGIC #### Validation and Statistics # COMMAND ---------- -import json +# Initial dataset stats +print("Num examples:", len(raw_dataset)) +print("First example:") +for ex in raw_dataset: + print(ex) + print() + break + +format_errors = defaultdict(int) + +for ex in raw_dataset: + if not isinstance(ex, dict): + format_errors["data_type"] += 1 + continue + + prompts = ex.get("prompt", None) + if not prompts: + format_errors["missing_prompt"] += 1 + continue + + responses = ex.get("response", None) + if not responses: + format_errors["missing_response"] += 1 + continue + +if format_errors: + print("Oops! Found errors:") + for k, v in format_errors.items(): + print(f"{k}: {v}") +else: + print("Congratulations! No errors found") -from streaming.base.storage.download import download_file -from streaming.base.storage.upload import CloudUploader +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Cost Estimation +# MAGIC +# MAGIC Tokenize the raw dataset and we see some statistics of the tokens and estimate the overall cost based on default trainining duration +# COMMAND ---------- + +MAX_TOKENS_PER_EXAMPLE = FT_API_args.context_length if FT_API_args.context_length is not None else 4096 +TARGET_EPOCHS = FT_API_args.training_duration if FT_API_args.training_duration is not None else 1 +n_epochs = TARGET_EPOCHS +n_train_examples = len(raw_dataset) -def integrity_check(out: Union[str, Tuple[str, str]]): - """Check if the index file has integrity. - - If index is a cloud url, first download it to a temp local file. 
- - Args: - out (Union[str, Tuple[str,str]]): MDS dataset path - """ - - def count_shards(mds_root: str): - n_shard_files = 0 - cu = CloudUploader.get(mds_root, exist_ok=True, keep_local=True) - for o in cu.list_objects(): - if o.endswith('.mds'): - n_shard_files += 1 - return n_shard_files - - cu = CloudUploader.get(out, keep_local=True, exist_ok=True) - - with tempfile.TemporaryDirectory() as temp_dir: - if cu.remote: - download_file(os.path.join(cu.remote, 'index.json'), - os.path.join(temp_dir, 'index.json'), - timeout=60) - actual_n_shard_files = count_shards(cu.remote) - local_merged_index_path = os.path.join(temp_dir, 'index.json') - else: - local_merged_index_path = os.path.join(cu.local, 'index.json') - actual_n_shard_files = count_shards(cu.local) - - merged_index = json.load(open(local_merged_index_path, 'r')) - n_shard_files = len( - {b['raw_data']['basename'] for b in merged_index['shards']}) - return n_shard_files == actual_n_shard_files - - -def check_HF_datasets(dataset_names_with_splits: list): - token = os.environ.get('HUGGING_FACE_HUB_TOKEN') - for dataset_name_with_split in dataset_names_with_splits: - dataset_name, split = os.path.split(dataset_name_with_split) - # make sure we have a dataset and split - if not dataset_name or not split: - return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Please ensure that you include the split name (e.g. 'mosaicml/dolly_hhrlhf/train')." - # check user access to the dataset - try: - _ = dataset_info(dataset_name) - except: - token_warning = '' - if not token: - token_warning = ' If this is a private dataset, please set your HUGGING_FACE_HUB_TOKEN using: mcli create secret hf.' - return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Please ensure that the dataset exists and that you have access to it. Remember to include the split name (e.g. 'mosaicml/dolly_hhrlhf/train')." + token_warning - # check that split exists - try: - splits = get_dataset_split_names(dataset_name) - except: # error raised in the case of multiple subsets - return False, f'Failed to load Hugging Face dataset {dataset_name_with_split}. Please make sure that the split is valid and that your dataset does not have subsets.' - if split not in splits: - return False, f'Failed to load Hugging Face dataset {dataset_name_with_split}. Split not found.' - return True, '' - - -def is_hf_dataset_path(path: str): - """Check if a given string is a dataset path used by Hugging Face. - - Args: - path (str): The string to be checked. - - Returns: - bool: True if the string is a dataset path, False otherwise. 
- """ - # Regular expression to match the dataset path pattern - pattern = r'^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+/?(train|validation|test)?/?$' - - return bool(re.match(pattern, path)) - - -def create_om_cfg(FT_API_args: Namespace): - task_type = FT_API_args.task_type - train_data_path = FT_API_args.train_data_path - model = FT_API_args.model - max_seq_len = FT_API_args.context_length - - common_args = { - 'drop_last': False, - 'num_workers': 2, - 'prefetch_factor': 2, - 'pin_memory': False, - 'persistent_workers': False, - 'timeout': 0 - } - if task_type == 'INSTRUCTION_FINETUNE': - cfg = om.create({ - 'dataset': { - 'hf_name': train_data_path, - 'split': 'train', - 'max_seq_len': max_seq_len, - 'decoder_only_format': True, - 'allow_pad_trimming': False, - 'shuffle': True, - }, - **common_args - }) - - else: - cfg = om.create({ - 'name': 'finetuning', - 'dataset': { - 'remote': train_data_path, - 'local': train_data_path, - 'split': 'train', - 'max_seq_len': max_seq_len, - 'decoder_only_format': True, - 'allow_pad_trimming': False, - 'packing_ratio': None, - 'shuffle': True, - }, - **common_args - }) - - tokenizer = build_tokenizer( - tokenizer_name=model, - tokenizer_kwargs={'model_max_length': max_seq_len}, - ) - - return cfg, tokenizer +batch_tokens = token_counts_and_validation(FT_API_args) +n_billing_tokens_in_dataset = sum(batch_tokens) +print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training") +print(f"By default, you'll train for {n_epochs} epochs on this dataset") +print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens") # COMMAND ---------- +# MAGIC %md +# MAGIC # Continued Pretrain + +# COMMAND ---------- -# build cfg from the inputs -def main(): - if FT_API_args.task_type == 'INSTRUCTION_FINETUNE': - # check if train_data_path is a valid HF dataset url with splits. - if not is_hf_dataset_path(FT_API_args.train_data_path): - raise ValueError( - f'Input path {FT_API_args.train_data_path} is not supported. It needs to be a valid Huggingface dataset path.' - ) - # load dataset.info and see if HF tokens are correctly set. - check_HF_datasets(FT_API_args.train_data_path) - - cfg, tokenizer = create_om_cfg(FT_API_args) - - elif FT_API_args.task_type == 'CONTINUED_PRETRAIN': - # check if train_data_path is a valid object store that composer supports - cfg, tokenizer = create_om_cfg(FT_API_args) - - input_folder = FT_API_args.train_data_path - output_folder = FT_API_args.save_folder - concat_tokens = FT_API_args.context_length - tokenizer_name = FT_API_args.model - - # Run convert_text_to_mds.py and dump MDS dataset to "save_folder" - args = parse_args(tokenizer, concat_tokens, output_folder, input_folder) - convert_text_to_mds(tokenizer_name=args.tokenizer, - output_folder=args.output_folder, - input_folder=args.input_folder, - concat_tokens=args.concat_tokens, - eos_text=args.eos_text, - bos_text=args.bos_text, - no_wrap=args.no_wrap, - compression=args.compression, - processes=args.processes, - reprocess=args.reprocess, - args_str=_args_str(args)) - - # Check if the MDS dataset is integral by checking index.json - if integrity_check(args.output_folder): - raise RuntimeError( - f'{args.output_folder} has mismatched number of shard files between merged index.json and actual shards!' 
- ) - - print('Converted data for continnued pre-training was saved in: ', - args.output_folder) - - else: - raise ValueError( - f'task_type can only be INSTRUCTION_FINETUNE or Continued_Pretraining but got {FT_API_args.task_type} instead!' - ) - # Run a few checks on resulted MDS datasets - # 1. no shards in output_folder - # 2. check shard completeness by downloading and inspecting index.json - - from llmfoundry.data.finetuning import build_finetuning_dataloader - tokenizer_name = 'EleutherAI/gpt-neox-20b' - tokenizer_kwargs = {'model_max_length': cfg.dataset.max_seq_len} - tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) - - device_batch_size = 1 - dataspec = build_finetuning_dataloader(cfg, tokenizer, device_batch_size) - dataloader = dataspec.dataloader - token_counting_func = dataspec.get_num_tokens_in_batch - - total_tokens = 0 - for batch in dataloader: - total_tokens += token_counting_func(batch) - - print('Total number of tokens:', total_tokens) +# MAGIC %md +# MAGIC #### User Defines + +# COMMAND ---------- + +FT_API_args = Namespace( + model='EleutherAI/gpt-neox-20b', + train_data_path= 'dbfs:/xiaohan-test/test_cpt/', + save_folder= 'dbfs:/databricks/mlflow-tracking/EXPERIMENT_ID/RUN_ID/artifacts/checkpoints', + task_type='CONTINUED_PRETRAIN', + training_duration=3, + context_length=2048, +) +temporary_mds_output_path = '/tmp/xiaohan-test/test_mds' # COMMAND ---------- -if __name__ == '__main__': - main() +# MAGIC %md +# MAGIC #### Data Loading (from text to MDS) +# MAGIC +# MAGIC Copy [llmfoundry/scripts/data_prep/convert_text_to_mds.py](https://github.com/mosaicml/llm-foundry/blob/main/scripts/data_prep/convert_text_to_mds.py) here and run the cell below + +# COMMAND ---------- + +from convert_text_to_mds import convert_text_to_mds, parse_args, _args_str + +# check if train_data_path is a valid object store that composer supports +cfg, tokenizer = create_om_cfg(FT_API_args) + +input_folder = FT_API_args.train_data_path +output_folder = FT_API_args.save_folder +concat_tokens = FT_API_args.context_length +tokenizer_name = FT_API_args.model + +# Run convert_text_to_mds.py and dump MDS dataset to "save_folder" +args = parse_args(tokenizer, concat_tokens, output_folder, input_folder) +convert_text_to_mds(tokenizer_name=args.tokenizer, + output_folder=temporary_mds_output_path, + input_folder=args.input_folder, + concat_tokens=args.concat_tokens, + eos_text=args.eos_text, + bos_text=args.bos_text, + no_wrap=args.no_wrap, + compression=args.compression, + processes=args.processes, + reprocess=args.reprocess, + args_str=_args_str(args)) + + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Alternative: Delta Ingestion +# MAGIC Once you have credentials set up with dbutils.secret or init script, You can ingest the folder of txt files and have the schema automatically inferred. 
The result is a spark dataframe and can be converted to MDS while Streaming's utility + +# COMMAND ---------- + +dbutils.fs.ls(FT_API_args.train_data_path) + +output_location = FT_API_args.train_data_path + '/*.txt' +df = spark.sql("SELECT * FROM read_files('%s')" % output_location).withColumnRenamed('value', 'text') +df.show() + +mds_kwargs = { + 'out': temporary_mds_output_path, + 'columns': { + 'tokens': 'bytes' + }, + 'keep_local': True +} +udf_kwargs = { + 'concat_tokens': FT_API_args.context_length, + 'tokenizer': FT_API_args.model, + 'eos_text': '', + 'compression': 'zstd', + 'no_wrap': False, + 'bos_text': '', +} + +dataframe_to_mds(df, + merge_index=True, + mds_kwargs=mds_kwargs, + udf_iterable=pandas_processing_fn, + udf_kwargs=udf_kwargs) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Validation + +# COMMAND ---------- + +print("Num examples:", len(df)) +print("First example:") +for ex in df['text']: + print(ex) + print() + break + +if integrity_check(temporary_mds_output_path): + raise ValueError("MDS has not been created correctly. There are missing shards") + +# Sanity Check +import numpy as np +from streaming import StreamingDataset +tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) +tokenizer.model_max_length = 5000000000 # Hack to prevent warnings from HuggingFace +dataset = StreamingDataset(local=mds_output_path, shuffle=False) +for i in range(5): + l = np.frombuffer(dataset[i]['tokens'], dtype=np.int64) + print(''.join(tokenizer.decode(l))) + print() + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### Cost Estimation + +# COMMAND ---------- + +MAX_TOKENS_PER_EXAMPLE = FT_API_args.context_length if FT_API_args.context_length is not None else 4096 +TARGET_EPOCHS = FT_API_args.training_duration if FT_API_args.training_duration is not None else 1 +n_epochs = TARGET_EPOCHS +n_train_examples = len(raw_dataset) + +batch_tokens = token_counts_and_validation(FT_API_args) +n_billing_tokens_in_dataset = sum(batch_tokens) + +print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training") +print(f"By default, you'll train for {n_epochs} epochs on this dataset") +print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens") + +# COMMAND ---------- + + + +# COMMAND ---------- + + From d2797b3552aad618b84a5df0fcf37be86ff8c371 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Thu, 11 Jan 2024 00:43:05 -0800 Subject: [PATCH 30/63] update --- llmfoundry/utils/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llmfoundry/utils/__init__.py b/llmfoundry/utils/__init__.py index fdf3d46e7e..3b92105751 100644 --- a/llmfoundry/utils/__init__.py +++ b/llmfoundry/utils/__init__.py @@ -18,7 +18,7 @@ create_om_cfg, token_counts_and_validation, check_HF_datasets, is_hf_dataset_path, is_uc_delta_table, pandas_processing_fn, integrity_check, convert_text_to_mds, - _args_str) + parse_args, _args_str) except ImportError as e: raise ImportError( @@ -49,5 +49,6 @@ 'pandas_processing_fn', 'integrity_check', 'convert_text_to_mds', + 'parse_args', '_args_str', ] From 756fdae2a8b75fe1abd70e73113d5a977f274846 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Thu, 11 Jan 2024 01:07:53 -0800 Subject: [PATCH 31/63] update --- llmfoundry/utils/validation_utils.py | 35 +++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/llmfoundry/utils/validation_utils.py b/llmfoundry/utils/validation_utils.py index dc5fa66242..03bed609a9 100644 --- 
a/llmfoundry/utils/validation_utils.py +++ b/llmfoundry/utils/validation_utils.py @@ -233,7 +233,40 @@ def count_shards(mds_root: str): DONE_FILENAME = '.text_to_mds_conversion_done' -def parse_args() -> Namespace: +def parse_args( tokenizer, + concat_tokens, + output_folder, + input_folder, + compression = 'zstd', + bos_text = '', + eos_text = '', + no_wrap = False , + processes = 32, + reprocess = False ) -> Namespace: + parsed = Namespace(tokenizer = tokenizer, + concat_tokens = model_max_length, + output_folder = output_folder, + input_folder = input_folder, + eos_text = eos_text, + bos_text = bos_text, + no_wrap = no_wrap, + compression = compression, + processes = processes, + reprocess = reprocess) + # Make sure we have needed concat options + if (parsed.concat_tokens is not None and + isinstance(parsed.concat_tokens, int) and parsed.tokenizer is None): + parser.error( + 'When setting --concat_tokens, you must specify a --tokenizer') + # now that we have validated them, change BOS/EOS to strings + if parsed.bos_text is None: + parsed.bos_text = '' + if parsed.eos_text is None: + parsed.eos_text = '' + return parsed + + +def original_parse_args() -> Namespace: """Parse commandline arguments.""" parser = ArgumentParser( description= From 6de8c37f5476df726f7bf96b7a1e2c74e7612164 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Thu, 11 Jan 2024 06:29:31 -0800 Subject: [PATCH 32/63] Read UC delta table (#773) * initial commit * use databricks-sql to read delta table and convert to json * update * update * update * add mocked unittest * Fix lints * update * update * restructure code * Add timer for optimizing * Add db-connect * add wrapper * update * add install dbconnect * update * update * patch dbconnect to allow multiple return formats * update * add arrow * use compression * clean up * Add cluster rt check * Fix lints * remove patch.py for CI * update * update * updat * update * fix tests * fix lint * update * update * Add more tests * update * update * update * change to download_json * update * fix lints * Add decompressed option for arrow * format json to jsonl * Add comments * Make cf_collect_type global option * fix comments * fix lints * fix comments * Fix lints * change to use workspaceclient * Add CPT support * Rewire method assignment logic * Fix bug in stripping https * Add tests for rewired method assignment logic * Fix lints * Fix lints * Removed logger set_level * Remove pyspark. 
It conflicts with databricks-connect * Update the comment * skip cluster version check when cluster_id is serverless * Add use_serverless flag * update tests with use_serverless flag * Fix lints --------- Co-authored-by: Xiaohan Zhang --- scripts/data_prep/convert_delta_to_json.py | 517 ++++++++++++++++++ setup.py | 5 +- .../data_prep/test_convert_delta_to_json.py | 304 ++++++++++ 3 files changed, 825 insertions(+), 1 deletion(-) create mode 100644 scripts/data_prep/convert_delta_to_json.py create mode 100644 tests/a_scripts/data_prep/test_convert_delta_to_json.py diff --git a/scripts/data_prep/convert_delta_to_json.py b/scripts/data_prep/convert_delta_to_json.py new file mode 100644 index 0000000000..8986849a42 --- /dev/null +++ b/scripts/data_prep/convert_delta_to_json.py @@ -0,0 +1,517 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +import logging +import os +import re +import time +import urllib.parse +from argparse import ArgumentParser, Namespace +from collections import namedtuple +from concurrent.futures import ProcessPoolExecutor +from typing import Iterable, List, Optional, Tuple, Union +from uuid import uuid4 + +import google.protobuf.any_pb2 as any_pb2 +import lz4.frame +import pandas as pd +import pyarrow as pa +import pyspark.sql.connect.proto as pb2 +import pyspark.sql.connect.proto.cloud_pb2 as cloud_pb2 +import requests +from databricks import sql +from databricks.connect import DatabricksSession +from databricks.sdk import WorkspaceClient +from databricks.sql.client import Connection as Connection +from databricks.sql.client import Cursor as Cursor +from packaging import version +from pyspark.sql import SparkSession +from pyspark.sql.connect.client.core import SparkConnectClient +from pyspark.sql.connect.client.reattach import \ + ExecutePlanResponseReattachableIterator +from pyspark.sql.connect.dataframe import DataFrame +from pyspark.sql.dataframe import DataFrame as SparkDataFrame +from pyspark.sql.types import Row + +MINIMUM_DB_CONNECT_DBR_VERSION = '14.1.0' +MINIMUM_SQ_CONNECT_DBR_VERSION = '12.2.0' + +log = logging.getLogger(__name__) + +Result = namedtuple( + 'Result', ['url', 'row_count', 'compressed_size', 'uncompressed_size' + ]) # pyright: ignore + +# ``collect_as_cf`` is an addon new feature monkey patch on top of the DB Connect package. +# It allows the client to fetch the results in different formats from the server. +# To be able to use the code make sure this module is not overriden by DB Connect classes. + + +def to_cf(self: SparkConnectClient, + plan: pb2.Plan, + type: str = 'json') -> Tuple[List[Result], int, bool]: + """Executes the query plans and return as presigned URLS for cloud fetch. + + It can handle the current output formats that are supported by the server. + In contrast to the regular API methods of the client, this method does not + return the schema and drops all other responses. + + Args: + plan (pb2.Plan): The plan object to be executed by spark. + type (str): The output format of the result, supported formats are 'json', 'csv', and 'arrow'. + + Returns: + Tuple[List[Result], int, bool]: A tuple containing: + - A list of Result namedtuples, each containing a URL, row count, compressed size, + and uncompressed size of the part of the result. + - Total row count of all parts of the result. + - A boolean indicating whether the result has been truncated. 
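+
+    Example:
+        A minimal illustrative sketch (mirroring the ``collect_as_cf`` wrapper
+        defined below), assuming ``df`` is a DataFrame bound to an active
+        Databricks Connect session:
+
+            plan = df._plan.to_proto(df._session.client)
+            results, row_count, truncated = df._session.client.to_cf(plan, type='arrow')
+            urls = [r.url for r in results]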
+ """ + req = self._execute_plan_request_with_metadata() + req.plan.CopyFrom(plan) + + # Add the request options + if type == 'json': + format = cloud_pb2.ResultOptions.CloudOptions.FORMAT_JSON + elif type == 'csv': + format = cloud_pb2.ResultOptions.CloudOptions.FORMAT_CSV + elif type == 'arrow': + format = cloud_pb2.ResultOptions.CloudOptions.FORMAT_ARROW + else: + raise ValueError( + f'Only formats json, csv, and arrow are supported. Got invalid type {type}' + ) + + ro = cloud_pb2.ResultOptions( + type=cloud_pb2.ResultOptions.TYPE_CLOUD, + cloudOptions=cloud_pb2.ResultOptions.CloudOptions( + format=format, + useCompression=False, + )) + cloud_option = any_pb2.Any() + cloud_option.Pack(ro) + req.request_options.append( + pb2.ExecutePlanRequest.RequestOption(extension=cloud_option)) + + # Create the iterator + iterator = ExecutePlanResponseReattachableIterator(req, self._stub, + self._retry_policy, + self._builder.metadata()) + # Iterate over the response + result = [] + row_count = 0 + is_overflow = False + + for response in iterator: + if response.HasField('extension') and response.extension.Is( + cloud_pb2.CloudResultBatch.DESCRIPTOR): + batch = cloud_pb2.CloudResultBatch() + if not response.extension.Is(cloud_pb2.CloudResultBatch.DESCRIPTOR): + raise ValueError( + 'Response extension is not of type CloudResultBatch.') + response.extension.Unpack(batch) + result += [ + Result(b.url, b.row_count, b.compressed_size, + b.uncompressed_size) for b in batch.results + ] + row_count += sum(result.row_count for result in batch.results) + is_overflow |= batch.truncated + return result, row_count, is_overflow + + +SparkConnectClient.to_cf = to_cf # pyright: ignore + + +def collect_as_cf(self: DataFrame, + type: str = 'json') -> Tuple[List[Result], int, bool]: + """Collects DataFrame execution plan as presigned URLs. + + This method is a wrapper around the `to_cf` method of SparkConnectClient. It takes the + execution plan of the current DataFrame, converts it to a protocol buffer format, and then + uses the `to_cf` method to execute the plan and fetch results as presigned URLs. + + Args: + type (str): The output format of the result, supported formats are 'json', 'csv', and 'arrow'. + + Returns: + Tuple[List[Result], int, bool]: A tuple containing: + - A list of Result namedtuples, each containing a URL, row count, compressed size, + and uncompressed size of the part of the result. + - Total row count of all parts of the result. + - A boolean indicating whether the result is truncated or overflowed. + """ + query = self._plan.to_proto(self._session.client) # pyright: ignore + return self._session.client.to_cf(query, type) # pyright: ignore + + +DataFrame.collect_cf = collect_as_cf # pyright: ignore + + +def iterative_combine_jsons(json_directory: str, output_file: str) -> None: + """Combine jsonl files in json_directory into one big jsonl file. + + This function does not work for nested subdirectories. 
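+    Input files are assumed to already be line-delimited JSON; they are simply
+    concatenated, in ``os.listdir`` order, into the output file.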
+ + Args: + json_directory(str): directory containing the JSONL files + output_file(str): path to the output combined JSONL file + """ + json_files = [f for f in os.listdir(json_directory) if f.endswith('.jsonl')] + with open(output_file, 'w') as outfile: + for file_name in json_files: + with open(os.path.join(json_directory, file_name), 'r') as infile: + for line in infile: + outfile.write(line) + log.info('JSON files have been combined into a JSONL file.') + + +def run_query( + query: str, + method: str, + cursor: Optional[Cursor] = None, + spark: Optional[SparkSession] = None, + collect: bool = True +) -> Optional[Union[List[Row], DataFrame, SparkDataFrame]]: + """Run SQL query via databricks-connect or databricks-sql. + + Args: + query (str): sql query + method (str): select from dbsql and dbconnect + cursor (Optional[Cursor]): connection.cursor + spark (Optional[SparkSession]): spark session + collect (bool): whether to get the underlying data from spark dataframe + """ + if method == 'dbsql': + if cursor is None: + raise ValueError(f'cursor cannot be None if using method dbsql') + cursor.execute(query) + if collect: + return cursor.fetchall() + elif method == 'dbconnect': + if spark == None: + raise ValueError(f'sparkSession is required for dbconnect') + df = spark.sql(query) + if collect: + return df.collect() + return df + else: + raise ValueError(f'Unrecognized method: {method}') + + +def get_args(signed: List, json_output_path: str, columns: List) -> Iterable: + for i, r in enumerate(signed): + yield (i, r.url, json_output_path, columns) + + +def download(ipart: int, + url: str, + json_output_path: str, + columns: Optional[List] = None, + resp_format: str = 'arrow', + compressed: bool = False) -> None: + """Thread download presigned url and save to jsonl locally. + + Args: + ipart (int): presigned url id + url (str): presigned url + json_output_path (str): directory to save the ipart_th segment of dataframe + columns (list): schema to save to json + resp_format (str): whether to use arrow or json when collect + compressed (bool): if data is compressed before downloading. Need decompress if compressed=True. + """ + resp = requests.get(url) + if resp.status_code == 200: + if resp_format == 'json': + data = resp.json() + pd.DataFrame(data, columns=columns).to_json(os.path.join( + json_output_path, 'part_' + str(ipart) + '.jsonl'), + orient='records', + lines=True) + return + + # When resp_format is arrow: + if compressed: + # The data is lz4 compressed arrow format. + # Decompress the data + decompressed_data = lz4.frame.decompress(resp.content) + # Convert the decompressed data into a PyArrow table + reader = pa.ipc.open_stream(decompressed_data) + else: + reader = pa.ipc.open_stream(resp.content) + table = reader.read_all() + + # Convert the PyArrow table into a pandas DataFrame + df = table.to_pandas() + df.to_json(os.path.join(json_output_path, + 'part_' + str(ipart) + '.jsonl'), + orient='records', + lines=True, + force_ascii=False) + + +def download_starargs(args: Tuple) -> None: + return download(*args) + + +def fetch_data(method: str, cursor: Optional[Cursor], + sparkSession: Optional[SparkSession], start: int, end: int, + order_by: str, tablename: str, columns_str: str, + json_output_path: str) -> None: + """Fetches a specified range of rows from a given table to a json file. + + This function executes a SQL query to retrieve a range of rows, determined by 'start' and 'end' indexes, + from a specified table and column set. The fetched data is then exported as a JSON file. 
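+    Rows are selected through a ``ROW_NUMBER() OVER (ORDER BY {order_by})`` window,
+    so each call reads a stable, non-overlapping slice ``[start+1, end]`` of the table.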
+ + Args: + method (str): The method to use for fetching data, either 'dbconnect' or 'dbsql'. + cursor (Optional[Cursor]): The cursor object for executing queries in 'dbsql' method. + sparkSession (Optional[SparkSession]): The Spark session object for executing queries in 'dbconnect' method. + start (int): The starting index for row fetching. + end (int): The ending index for row fetching. + order_by (str): The column name to use for ordering the rows. + tablename (str): The name of the table from which to fetch the data. + columns_str (str): The string representation of the columns to select from the table. + json_output_path (str): The file path where the resulting JSON file will be saved. + + Returns: + None: The function doesn't return any value, but writes the result to a JSONL file. + """ + query = f""" + WITH NumberedRows AS ( + SELECT + *, + ROW_NUMBER() OVER (ORDER BY {order_by}) AS rn + FROM + {tablename} + ) + SELECT {columns_str} + FROM NumberedRows + WHERE rn BETWEEN {start+1} AND {end}""" + + if method == 'dbconnect': + spark_df = run_query(query, method, cursor, sparkSession, collect=False) + if spark_df is None: + raise RuntimeError( + f'Expect spark dataframe with {query} but got None') + pdf = spark_df.toPandas() # pyright: ignore + else: # method == 'dbsql': + ans = run_query(query, method, cursor, sparkSession, collect=True) + if ans is None: + raise RuntimeError(f'Got empty results with {query}') + records = [r.asDict() for r in ans] # pyright: ignore + pdf = pd.DataFrame.from_dict(records) + + pdf.to_json(os.path.join(json_output_path, f'part_{start+1}_{end}.jsonl'), + orient='records', + lines=True) + + +def fetch( + method: str, + tablename: str, + json_output_path: str, + batch_size: int = 1 << 30, + processes: int = 1, + sparkSession: Optional[SparkSession] = None, + dbsql: Optional[Connection] = None, +) -> None: + """Fetch UC delta table with databricks-connnect as JSONL. + + Args: + method (str): dbconnect or dbsql + tablename (str): catalog.scheme.tablename on UC + json_output_path (str): path to write the result json file to + batch_size (int): number of rows that dbsql fetches each time to avoid OOM + processes (int): max number of processes to use to parallelize the fetch + sparkSession (pyspark.sql.sparksession): spark session + dbsql (databricks.sql.connect): dbsql session + """ + cursor = dbsql.cursor() if dbsql is not None else None + + try: + ans = run_query(f'SELECT COUNT(*) FROM {tablename}', method, cursor, + sparkSession) + nrows = [row.asDict() for row in ans][0].popitem()[1] # pyright: ignore + log.info(f'total_rows = {nrows}') + except Exception as e: + raise RuntimeError( + f'Error in get total rows from {tablename}. Restart sparkSession and try again' + ) from e + + try: + ans = run_query(f'SHOW COLUMNS IN {tablename}', method, cursor, + sparkSession) + columns = [row.asDict().popitem()[1] for row in ans] # pyright: ignore + order_by = columns[0] + columns_str = ','.join(columns) + log.info(f'order by column {order_by}') + except Exception as e: + raise RuntimeError( + f'Error in get columns from {tablename}. Restart sparkSession and try again' + ) from e + + if method == 'dbconnect' and sparkSession is not None: + log.info('processes = ', processes) + df = sparkSession.table(tablename) + + # Running the query and collecting the data as arrow or json. 
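+        # ``collect_cf`` (the monkey-patched ``collect_as_cf`` above) executes the plan
+        # server-side and returns Result namedtuples, one per chunk of the result set;
+        # each carries a presigned URL that the worker processes below download and
+        # convert to JSONL.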
+ signed, _, _ = df.collect_cf('arrow') # pyright: ignore + log.info(f'len(signed) = {len(signed)}') + + args = get_args(signed, json_output_path, columns) + + # Stopping the SparkSession to avoid spilling connection state into the subprocesses. + sparkSession.stop() + + with ProcessPoolExecutor(max_workers=processes) as executor: + list(executor.map(download_starargs, args)) + + elif method == 'dbsql' and cursor is not None: + for start in range(0, nrows, batch_size): + log.warning(f'batch {start}') + end = min(start + batch_size, nrows) + fetch_data(method, cursor, sparkSession, start, end, order_by, + tablename, columns_str, json_output_path) + + if cursor is not None: + cursor.close() + + +def fetch_DT(args: Namespace) -> None: + """Fetch UC Delta Table to local as jsonl.""" + log.info(f'Start .... Convert delta to json') + + obj = urllib.parse.urlparse(args.json_output_path) + if obj.scheme != '': + raise ValueError( + f'Check the json_output_path and verify it is a local path!') + + if os.path.exists(args.json_output_path): + if not os.path.isdir(args.json_output_path) or os.listdir( + args.json_output_path): + raise RuntimeError( + f'A file or a folder {args.json_output_path} already exists and is not empty. Remove it and retry!' + ) + + os.makedirs(args.json_output_path, exist_ok=True) + + log.info(f'Directory {args.json_output_path} created.') + + method = 'dbsql' + dbsql = None + sparkSession = None + + if args.use_serverless: + method = 'dbconnect' + else: + w = WorkspaceClient() + res = w.clusters.get(cluster_id=args.cluster_id) + runtime_version = res.spark_version.split('-scala')[0].replace( + 'x-snapshot', '0').replace('x', '0') + if version.parse(runtime_version) < version.parse( + MINIMUM_SQ_CONNECT_DBR_VERSION): + raise ValueError( + f'The minium DBR version required is {MINIMUM_SQ_CONNECT_DBR_VERSION} but got {version.parse(runtime_version)}' + ) + + if args.http_path is None and version.parse( + runtime_version) >= version.parse( + MINIMUM_DB_CONNECT_DBR_VERSION): + method = 'dbconnect' + + if method == 'dbconnect': + try: + if args.use_serverless: + session_id = str(uuid4()) + sparkSession = DatabricksSession.builder.host( + args.DATABRICKS_HOST).token(args.DATABRICKS_TOKEN).header( + 'x-databricks-session-id', session_id).getOrCreate() + + else: + sparkSession = DatabricksSession.builder.remote( + host=args.DATABRICKS_HOST, + token=args.DATABRICKS_TOKEN, + cluster_id=args.cluster_id).getOrCreate() + + except Exception as e: + raise RuntimeError( + 'Failed to create databricks connection. Check hostname and access token!' + ) from e + else: + try: + dbsql = sql.connect( + server_hostname=re.compile(r'^https?://').sub( + '', args.DATABRICKS_HOST).strip( + ), # sqlconnect hangs if hostname starts with https + http_path=args.http_path, + access_token=args.DATABRICKS_TOKEN, + ) + except Exception as e: + raise RuntimeError( + 'Failed to create sql connection to db workspace. To use sql connect, you need to provide http_path and cluster_id!' 
+ ) from e + + fetch(method, args.delta_table_name, args.json_output_path, args.batch_size, + args.processes, sparkSession, dbsql) + + if dbsql is not None: + dbsql.close() + + # combine downloaded jsonl into one big jsonl for IFT + iterative_combine_jsons( + args.json_output_path, + os.path.join(args.json_output_path, 'combined.jsonl')) + + +if __name__ == '__main__': + parser = ArgumentParser( + description= + 'Download delta table from UC and convert to json to save local') + parser.add_argument('--delta_table_name', + required=True, + type=str, + help='UC table ..') + parser.add_argument('--json_output_path', + required=True, + type=str, + help='Local path to save the converted json') + parser.add_argument('--http_path', + required=False, + type=str, + help='http_path is set then dbsql method is used') + parser.add_argument('--batch_size', + required=False, + type=int, + default=1 << 30, + help='row chunks to transmit a time to avoid OOM') + parser.add_argument('--processes', + required=False, + type=int, + default=os.cpu_count(), + help='number of processes allowed to use') + parser.add_argument( + '--cluster_id', + required=True, + type=str, + default=None, + help= + 'cluster id has runtime newer than 14.1.0 and access mode of either assigned or shared can use databricks-connect.' + ) + parser.add_argument( + '--use_serverless', + required=False, + type=bool, + default=False, + help= + 'Use serverless or not. Make sure the workspace is entitled with serverless' + ) + args = parser.parse_args() + + from databricks.sdk import WorkspaceClient + w = WorkspaceClient() + args.DATABRICKS_HOST = w.config.host + args.DATABRICKS_TOKEN = w.config.token + + tik = time.time() + fetch_DT(args) + log.info('Elapsed time', time.time() - tik) diff --git a/setup.py b/setup.py index 3de80f2292..5444352cf7 100644 --- a/setup.py +++ b/setup.py @@ -84,7 +84,10 @@ ] extra_deps['databricks'] = [ - 'mosaicml[databricks]>=0.17.2,<0.18', + 'mosaicml[databricks]>=0.17.1,<0.18', + 'databricks-sql-connector>=3,<4', + 'databricks-connect==14.1.0', + 'lz4>=4,<5', ] extra_deps['tensorboard'] = [ diff --git a/tests/a_scripts/data_prep/test_convert_delta_to_json.py b/tests/a_scripts/data_prep/test_convert_delta_to_json.py new file mode 100644 index 0000000000..39bc5d8099 --- /dev/null +++ b/tests/a_scripts/data_prep/test_convert_delta_to_json.py @@ -0,0 +1,304 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +# copyright 2022 mosaicml llm foundry authors +# spdx-license-identifier: apache-2.0 + +import unittest +from argparse import Namespace +from typing import Any +from unittest.mock import MagicMock, mock_open, patch + +from scripts.data_prep.convert_delta_to_json import (download, fetch_DT, + iterative_combine_jsons, + run_query) + + +class TestConverDeltaToJsonl(unittest.TestCase): + + @patch('scripts.data_prep.convert_delta_to_json.sql.connect') + @patch('scripts.data_prep.convert_delta_to_json.os.makedirs') + @patch('scripts.data_prep.convert_delta_to_json.iterative_combine_jsons') + @patch('scripts.data_prep.convert_delta_to_json.fetch') + @patch('scripts.data_prep.convert_delta_to_json.WorkspaceClient') + def test_stream_delta_to_json(self, mock_workspace_client: Any, + mock_fetch: Any, mock_combine_jsons: Any, + mock_makedirs: Any, mock_sql_connect: Any): + + args = MagicMock() + args.delta_table_name = 'test_table' + args.json_output_path = '/path/to/jsonl' + args.DATABRICKS_HOST = 'test_host' + args.DATABRICKS_TOKEN = 'test_token' + args.http_path = 'test_path' + 
args.batch_size = 1000 + args.partitions = 1 + args.cluster_id = '1234' + args.debug = False + args.use_serverless = False + + mock_cluster_get = MagicMock() + mock_cluster_get.return_value = MagicMock( + spark_version='14.1.0-scala2.12') + mock_workspace_client.return_value.clusters.get = mock_cluster_get + + fetch_DT(args) + mock_sql_connect.assert_called_once_with(server_hostname='test_host', + http_path='test_path', + access_token='test_token') + mock_makedirs.assert_called_once_with('/path/to/jsonl', exist_ok=True) + mock_fetch.assert_called_once() + mock_combine_jsons.assert_called_once_with( + '/path/to/jsonl', '/path/to/jsonl/combined.jsonl') + + @patch('scripts.data_prep.convert_delta_to_json.os.listdir') + @patch('builtins.open', + new_callable=mock_open, + read_data='{"key": "value"}') + def test_iterative_combine_jsons(self, mock_file: Any, mock_listdir: Any): + mock_listdir.return_value = ['file1.jsonl', 'file2.jsonl'] + json_directory = '/fake/dir' + output_file = '/fake/output.jsonl' + + iterative_combine_jsons(json_directory, output_file) + + mock_listdir.assert_called_once_with(json_directory) + mock_file.assert_called() + """ + Diagnostic print + for call_args in mock_file().write.call_args_list: + print(call_args) + -------------------- + call('{') + call('"key"') + call(': ') + call('"value"') + call('}') + call('\n') + call('{') + call('"key"') + call(': ') + call('"value"') + call('}') + call('\n') + -------------------- + """ + self.assertEqual(mock_file().write.call_count, 2) + + @patch('scripts.data_prep.convert_delta_to_json.SparkSession') + def test_run_query_dbconnect(self, mock_spark: Any): + method = 'dbconnect' + mock_cursor = None + mock_spark.sql.return_value.collect.return_value = 'result' + + result = run_query('SELECT * FROM table', + method, + cursor=mock_cursor, + spark=mock_spark) + + mock_spark.sql.assert_called_once_with('SELECT * FROM table') + self.assertEqual(result, 'result') + + @patch('scripts.data_prep.convert_delta_to_json.Cursor') + def test_run_query_dbsql(self, mock_cursor: Any): + method = 'dbsql' + mock_cursor.fetchall.return_value = 'result' + mock_spark = None + + result = run_query('SELECT * FROM table', + method, + cursor=mock_cursor, + spark=mock_spark) + + mock_cursor.execute.assert_called_once_with('SELECT * FROM table') + self.assertEqual(result, 'result') + + @patch('scripts.data_prep.convert_delta_to_json.requests.get') + @patch('scripts.data_prep.convert_delta_to_json.pd.DataFrame.to_json') + @patch('scripts.data_prep.convert_delta_to_json.os.path.join', + return_value='/fake/path/part_1.jsonl') + @patch('scripts.data_prep.convert_delta_to_json.time.sleep' + ) # Mock sleep to speed up the test + def test_download_success(self, mock_sleep: Any, mock_join: Any, + mock_to_json: Any, mock_get: Any): + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = [['val1.1', 'val1.2'], + ['val2.1', 'val2.2']] + mock_get.return_value = mock_response + + download(1, + 'http://fakeurl.com/data', + '/fake/path', ['A', 'B'], + resp_format='json') + + mock_get.assert_called_with('http://fakeurl.com/data') + mock_join.assert_called_with('/fake/path', 'part_1.jsonl') + mock_to_json.assert_called_with('/fake/path/part_1.jsonl', + orient='records', + lines=True) + + mock_get.assert_called_once_with('http://fakeurl.com/data') + + @patch('scripts.data_prep.convert_delta_to_json.sql.connect') + @patch('scripts.data_prep.convert_delta_to_json.DatabricksSession') + 
@patch('scripts.data_prep.convert_delta_to_json.WorkspaceClient') + @patch('scripts.data_prep.convert_delta_to_json.os.makedirs') + @patch('scripts.data_prep.convert_delta_to_json.iterative_combine_jsons') + @patch('scripts.data_prep.convert_delta_to_json.fetch') + def test_dbconnect_called(self, mock_fetch: Any, mock_combine_jsons: Any, + mock_makedirs: Any, mock_workspace_client: Any, + mock_databricks_session: Any, + mock_sql_connect: Any): + + args = MagicMock() + + args.delta_table_name = 'test_table' + args.json_output_path = '/path/to/jsonl' + # Execute function with http_path=None (should use dbconnect) + args.http_path = None + args.cluster_id = '1234' + args.DATABRICKS_HOST = 'host' + args.DATABRICKS_TOKEN = 'token' + args.use_serverless = False + + mock_cluster_response = Namespace(spark_version='14.1.0-scala2.12') + mock_workspace_client.return_value.clusters.get.return_value = mock_cluster_response + + mock_remote = MagicMock() + mock_remote.getOrCreate.return_value = MagicMock( + ) # Mock return value for getOrCreate + mock_databricks_session.builder.remote.return_value = mock_remote + + fetch_DT(args) + mock_databricks_session.builder.remote.assert_called_once_with( + host=args.DATABRICKS_HOST, + token=args.DATABRICKS_TOKEN, + cluster_id=args.cluster_id) + + @patch('scripts.data_prep.convert_delta_to_json.sql.connect') + @patch('scripts.data_prep.convert_delta_to_json.DatabricksSession') + @patch('scripts.data_prep.convert_delta_to_json.WorkspaceClient') + @patch('scripts.data_prep.convert_delta_to_json.os.makedirs') + @patch('scripts.data_prep.convert_delta_to_json.iterative_combine_jsons') + @patch('scripts.data_prep.convert_delta_to_json.fetch') + def test_sqlconnect_called_dbr13(self, mock_fetch: Any, + mock_combine_jsons: Any, + mock_makedirs: Any, + mock_workspace_client: Any, + mock_databricks_session: Any, + mock_sql_connect: Any): + + args = MagicMock() + + args.delta_table_name = 'test_table' + args.json_output_path = '/path/to/jsonl' + # Execute function with http_path=None (should use dbconnect) + args.http_path = 'test_path' + args.cluster_id = '1234' + args.DATABRICKS_HOST = 'host' + args.DATABRICKS_TOKEN = 'token' + args.use_serverless = False + + mock_cluster_response = Namespace(spark_version='13.0.0-scala2.12') + mock_workspace_client.return_value.clusters.get.return_value = mock_cluster_response + + fetch_DT(args) + mock_sql_connect.assert_called_once_with( + server_hostname=args.DATABRICKS_HOST, + http_path=args.http_path, + access_token=args.DATABRICKS_TOKEN) + + @patch('scripts.data_prep.convert_delta_to_json.sql.connect') + @patch('scripts.data_prep.convert_delta_to_json.DatabricksSession') + @patch('scripts.data_prep.convert_delta_to_json.WorkspaceClient') + @patch('scripts.data_prep.convert_delta_to_json.os.makedirs') + @patch('scripts.data_prep.convert_delta_to_json.iterative_combine_jsons') + @patch('scripts.data_prep.convert_delta_to_json.fetch') + def test_sqlconnect_called_dbr14(self, mock_fetch: Any, + mock_combine_jsons: Any, + mock_makedirs: Any, + mock_workspace_client: Any, + mock_databricks_session: Any, + mock_sql_connect: Any): + + args = MagicMock() + + args.delta_table_name = 'test_table' + args.json_output_path = '/path/to/jsonl' + # Execute function with http_path=None (should use dbconnect) + args.http_path = 'test_path' + args.cluster_id = '1234' + args.DATABRICKS_HOST = 'host' + args.DATABRICKS_TOKEN = 'token' + args.use_serverless = False + + mock_cluster_response = Namespace(spark_version='14.2.0-scala2.12') + 
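+        # http_path is set above, so fetch_DT is expected to keep the dbsql code path
+        # even though the cluster runtime is newer than 14.1.0.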
mock_workspace_client.return_value.clusters.get.return_value = mock_cluster_response + + fetch_DT(args) + mock_sql_connect.assert_called_once_with( + server_hostname=args.DATABRICKS_HOST, + http_path=args.http_path, + access_token=args.DATABRICKS_TOKEN) + + @patch('scripts.data_prep.convert_delta_to_json.sql.connect') + @patch('scripts.data_prep.convert_delta_to_json.DatabricksSession') + @patch('scripts.data_prep.convert_delta_to_json.WorkspaceClient') + @patch('scripts.data_prep.convert_delta_to_json.os.makedirs') + @patch('scripts.data_prep.convert_delta_to_json.iterative_combine_jsons') + @patch('scripts.data_prep.convert_delta_to_json.fetch') + def test_sqlconnect_called_https(self, mock_fetch: Any, + mock_combine_jsons: Any, + mock_makedirs: Any, + mock_workspace_client: Any, + mock_databricks_session: Any, + mock_sql_connect: Any): + + args = MagicMock() + + args.delta_table_name = 'test_table' + args.json_output_path = '/path/to/jsonl' + # Execute function with http_path=None (should use dbconnect) + args.http_path = 'test_path' + args.cluster_id = '1234' + args.DATABRICKS_HOST = 'https://test-host' + args.DATABRICKS_TOKEN = 'token' + args.use_serverless = False + + mock_cluster_response = Namespace(spark_version='14.2.0-scala2.12') + mock_workspace_client.return_value.clusters.get.return_value = mock_cluster_response + + fetch_DT(args) + mock_sql_connect.assert_called_once_with( + server_hostname='test-host', + http_path=args.http_path, + access_token=args.DATABRICKS_TOKEN) + + @patch('scripts.data_prep.convert_delta_to_json.sql.connect') + @patch('scripts.data_prep.convert_delta_to_json.DatabricksSession') + @patch('scripts.data_prep.convert_delta_to_json.WorkspaceClient') + @patch('scripts.data_prep.convert_delta_to_json.os.makedirs') + @patch('scripts.data_prep.convert_delta_to_json.iterative_combine_jsons') + @patch('scripts.data_prep.convert_delta_to_json.fetch') + def test_serverless(self, mock_fetch: Any, mock_combine_jsons: Any, + mock_makedirs: Any, mock_workspace_client: Any, + mock_databricks_session: Any, mock_sql_connect: Any): + + args = MagicMock() + + args.delta_table_name = 'test_table' + args.json_output_path = '/path/to/jsonl' + # Execute function with http_path=None (should use dbconnect) + args.http_path = 'test_path' + args.cluster_id = '1234' + args.DATABRICKS_HOST = 'https://test-host' + args.DATABRICKS_TOKEN = 'token' + args.use_serverless = True + + mock_cluster_response = Namespace(spark_version='14.2.0-scala2.12') + mock_workspace_client.return_value.clusters.get.return_value = mock_cluster_response + + fetch_DT(args) + assert not mock_sql_connect.called + assert not mock_databricks_session.builder.remote.called From 93b5a9f9fa6f2174d13dab6a05e4309f1b5cb30f Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Thu, 11 Jan 2024 10:21:50 -0800 Subject: [PATCH 33/63] Add download remote function to util --- llmfoundry/utils/validation_utils.py | 81 ++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/llmfoundry/utils/validation_utils.py b/llmfoundry/utils/validation_utils.py index 03bed609a9..dcbbe78ecd 100644 --- a/llmfoundry/utils/validation_utils.py +++ b/llmfoundry/utils/validation_utils.py @@ -665,3 +665,84 @@ def _args_str(original_args: Namespace) -> str: ) return str(args) + + + +from composer.utils import dist, get_file, parse_uri +from llmfoundry.data.finetuning.tasks import (DOWNLOADED_FT_DATASETS_DIRPATH, + SUPPORTED_EXTENSIONS, + dataset_constructor) + +def _download_remote_hf_dataset(remote_path: str, split: str) -> str: + 
"""Downloads a dataset from a remote object store. + + This function supports 'jsonl', 'csv', and 'parquet' file formats for the dataset. It will attempt to download + the dataset, then once it is downloaded, convert it into HuggingFace ``datasets`` format, and then return this + dataset. + + The function also ensures synchronicity across multiple processes during the file download. It creates a signal + file that is used to synchronize the start of the download across different processes. Once the download is + completed, the function removes the signal file. + + Args: + hf_name (str): The path of the HuggingFace dataset to download. + split (str): The dataset split to download (e.g., 'train', 'validation', 'test'). + + Returns: + A local directory path where the dataset files are stored. + + Raises: + FileNotFoundError: Raised if the dataset file cannot be found with any of the supported extensions. + """ + finetune_dir = os.path.join( + DOWNLOADED_FT_DATASETS_DIRPATH, + split if split != 'data' else 'data_not', + ) + os.makedirs(finetune_dir, exist_ok=True) + for extension in SUPPORTED_EXTENSIONS: + name = f'{remote_path.strip("/")}/{split}{extension}' + destination = str( + os.path.abspath( + os.path.join(finetune_dir, 'data', + f'{split}-00000-of-00001{extension}'))) + + # Since we don't know exactly what the extension will be, since it is one of a list + # use a signal file to wait for instead of the desired file + signal_file_path = os.path.join( + finetune_dir, f'.node_{dist.get_node_rank()}_local_rank0_completed') + if dist.get_local_rank() == 0: + try: + get_file(path=name, destination=destination, overwrite=True) + except FileNotFoundError as e: + if extension == SUPPORTED_EXTENSIONS[-1]: + files_searched = [ + f'{cfg.dataset.hf_name}/{cfg.dataset.split}{ext}' + for ext in SUPPORTED_EXTENSIONS + ] + raise FileNotFoundError( + f'Could not find a file with any of ' + \ + f'the supported extensions: {SUPPORTED_EXTENSIONS}\n' + \ + f'at {files_searched}' + ) from e + else: + log.debug( + f'Could not find {name}, looking for another extension') + continue + + os.makedirs(os.path.dirname(signal_file_path), exist_ok=True) + with open(signal_file_path, 'wb') as f: + f.write(b'local_rank0_completed_download') + + # Avoid the collective call until the local rank zero has finished trying to download the dataset + # so that we don't timeout for large downloads. 
This syncs all processes on the node + with dist.local_rank_zero_download_and_wait(signal_file_path): + # Then, wait to ensure every node has finished trying to download the dataset + dist.barrier() + + # clean up signal file + if dist.get_local_rank() == 0: + os.remove(signal_file_path) + dist.barrier() + break + return finetune_dir + From b47c878ad2d094b91d849c32775354e74c9a4446 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Thu, 11 Jan 2024 10:48:16 -0800 Subject: [PATCH 34/63] update --- llmfoundry/utils/validation_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llmfoundry/utils/validation_utils.py b/llmfoundry/utils/validation_utils.py index dcbbe78ecd..a2f0bdc7ed 100644 --- a/llmfoundry/utils/validation_utils.py +++ b/llmfoundry/utils/validation_utils.py @@ -242,9 +242,9 @@ def parse_args( tokenizer, eos_text = '', no_wrap = False , processes = 32, - reprocess = False ) -> Namespace: + reprocess = True ) -> Namespace: parsed = Namespace(tokenizer = tokenizer, - concat_tokens = model_max_length, + concat_tokens = concat_tokens, output_folder = output_folder, input_folder = input_folder, eos_text = eos_text, From fa8f3d96e7bf53f8e21ed0fefc8dfba1bb269c18 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Thu, 11 Jan 2024 14:43:58 -0500 Subject: [PATCH 35/63] remove fused layernorm (#859) --- llmfoundry/utils/builders.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 404ad604ab..75438b895e 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -243,8 +243,6 @@ def build_algorithm(name: str, kwargs: Dict[str, Any]) -> Algorithm: return algorithms.GradientClipping(**kwargs) elif name == 'alibi': return algorithms.Alibi(**kwargs) - elif name == 'fused_layernorm': - return algorithms.FusedLayerNorm(**kwargs) elif name == 'gated_linear_units': return algorithms.GatedLinearUnits(**kwargs) elif name == 'low_precision_layernorm': From 13fd34c79284965155c4e9edabd32e6ab2b48f71 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Thu, 11 Jan 2024 12:27:37 -0800 Subject: [PATCH 36/63] update --- llmfoundry/data/finetuning/dataloader.py | 17 +++++++++++++++-- llmfoundry/utils/validation_utils.py | 2 +- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 4e1c3bbf9f..00cdc76d29 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -5,6 +5,7 @@ from typing import Tuple, Union import torch +import datasets as hf_datasets from composer.core.data_spec import DataSpec from composer.utils import dist, get_file, parse_uri from omegaconf import DictConfig @@ -26,7 +27,7 @@ def build_finetuning_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, - device_batch_size: int) -> DataSpec: + device_batch_size: int) -> Tuple[DataSpec, hf_datasets.Dataset]: """Builds a finetuning dataloader for training or evaluating. 
The underlying dataset can be built through one of two code paths: @@ -192,6 +193,18 @@ def build_finetuning_dataloader(cfg: DictConfig, tokenizer=tokenizer, hf_kwargs=cfg.dataset.get('hf_kwargs', {})) + detected_cpu_count = os.cpu_count() or 1 + detected_cpus_with_margin = detected_cpu_count - 8 + num_cpus_to_use = max(1, detected_cpus_with_margin) + + columns_to_remove = [] + token_lens = dataset.map( + lambda ex: len(ex), + batched=False, + num_proc=num_cpus_to_use, + desc='List of Token length', + ) + # Ensure dataset is large enough. if cfg.drop_last: world_size = dist.get_world_size() @@ -231,7 +244,7 @@ def build_finetuning_dataloader(cfg: DictConfig, token_counting_func = get_tokens_per_batch_func() - return DataSpec(dataloader=dl, get_num_tokens_in_batch=token_counting_func) + return DataSpec(dataloader=dl, get_num_tokens_in_batch=token_counting_func), token_lens def _validate_config(dataset_cfg: DictConfig) -> None: diff --git a/llmfoundry/utils/validation_utils.py b/llmfoundry/utils/validation_utils.py index a2f0bdc7ed..dd0df76fe4 100644 --- a/llmfoundry/utils/validation_utils.py +++ b/llmfoundry/utils/validation_utils.py @@ -89,7 +89,7 @@ def token_counts_and_validation(FT_API_args): token_counting_func = dataspec.get_num_tokens_in_batch total_tokens = [] - for batch in dataloader: + for batch in tqdm(dataloader): n_batch_tokens = token_counting_func(batch) if n_batch_tokens == 0: raise ValueError("Empty train sample") From 610f669486ff9414ece6a8da8612839df54a2a6f Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Thu, 11 Jan 2024 12:32:41 -0800 Subject: [PATCH 37/63] update --- llmfoundry/data/finetuning/dataloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 00cdc76d29..52775427b7 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -199,7 +199,7 @@ def build_finetuning_dataloader(cfg: DictConfig, columns_to_remove = [] token_lens = dataset.map( - lambda ex: len(ex), + lambda ex: {'ntokens': len(ex)}, batched=False, num_proc=num_cpus_to_use, desc='List of Token length', From 9f2e51b3a15f6fc4d6395e8bf8139b092596f3ec Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Thu, 11 Jan 2024 13:55:20 -0800 Subject: [PATCH 38/63] update --- llmfoundry/utils/__init__.py | 1 + llmfoundry/utils/validation_utils.py | 55 ++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/llmfoundry/utils/__init__.py b/llmfoundry/utils/__init__.py index 3b92105751..a9c633d9f2 100644 --- a/llmfoundry/utils/__init__.py +++ b/llmfoundry/utils/__init__.py @@ -43,6 +43,7 @@ 'pop_config', 'create_om_cfg', 'token_counts_and_validation', + 'token_counts', 'check_HF_datasets', 'is_hf_dataset_path', 'is_uc_delta_table', diff --git a/llmfoundry/utils/validation_utils.py b/llmfoundry/utils/validation_utils.py index dd0df76fe4..6af8b8fed8 100644 --- a/llmfoundry/utils/validation_utils.py +++ b/llmfoundry/utils/validation_utils.py @@ -97,6 +97,61 @@ def token_counts_and_validation(FT_API_args): return total_tokens +from typing import (Any, Callable, Dict, List, Mapping, Optional, Sequence, + Union, cast) +import torch + +def get_num_samples_in_batch(batch:dict) -> int: + decoder_only = True + + if not isinstance(batch, Mapping) or ('attention_mask' not in batch and + 'input_ids' not in batch): + raise ValueError( + 'get_tokens_per_batch_func() requires a batch with an attention_mask key or an input_ids key' + ) + + if not decoder_only 
and 'decoder_attention_mask' not in batch: + raise ValueError( + 'get_tokens_per_batch_func() for encoder decoder requires a batch with a decoder_attention_mask key' + ) + + # Count number of non padding tokens in batch + if 'attention_mask' in batch: + input_ids_tokens = int(sum(batch['attention_mask'])) + else: + input_ids_tokens = batch['input_ids'].numel() + + # For encoder decoder models only + decoder_input_ids_tokens = 0 + if not decoder_only: + decoder_input_ids_tokens = int( + torch.sum(batch['decoder_attention_mask']).item()) + + return {'ntokens': input_ids_tokens + decoder_input_ids_tokens} + +def token_counts(FT_API_args): + from llmfoundry.data.finetuning import build_finetuning_dataloader + + cfg, tokenizer = create_om_cfg(FT_API_args) + + device_batch_size = 1 + dataspec, token_lens = build_finetuning_dataloader(cfg, tokenizer, device_batch_size) + dataloader = dataspec.dataloader + + detected_cpu_count = os.cpu_count() or 1 + detected_cpus_with_margin = detected_cpu_count - 8 + num_cpus_to_use = max(1, detected_cpus_with_margin) + + token_lens = ds.map( + get_num_samples_in_batch, + batched=False, + num_proc=num_cpus_to_use, + desc='List of Token length', + ) + + return token_lens + + def check_HF_datasets(dataset_names_with_splits: list): token = os.environ.get('HUGGING_FACE_HUB_TOKEN') for dataset_name_with_split in dataset_names_with_splits: From ec68f10f1d0ca0d1b2ca1b346ca7c83f963ed00c Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Thu, 11 Jan 2024 14:12:37 -0800 Subject: [PATCH 39/63] update --- llmfoundry/utils/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmfoundry/utils/__init__.py b/llmfoundry/utils/__init__.py index a9c633d9f2..c4f1d0c1cd 100644 --- a/llmfoundry/utils/__init__.py +++ b/llmfoundry/utils/__init__.py @@ -15,7 +15,7 @@ download_from_cache_server, download_from_hf_hub) from llmfoundry.utils.validation_utils import ( - create_om_cfg, token_counts_and_validation, + create_om_cfg, token_counts_and_validation, token_counts, check_HF_datasets, is_hf_dataset_path, is_uc_delta_table, pandas_processing_fn, integrity_check, convert_text_to_mds, parse_args, _args_str) From 1e76068e0efce93119b91b58183bbce029a5b1b8 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Thu, 11 Jan 2024 14:23:19 -0800 Subject: [PATCH 40/63] update --- llmfoundry/utils/validation_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llmfoundry/utils/validation_utils.py b/llmfoundry/utils/validation_utils.py index 6af8b8fed8..66329fcf3a 100644 --- a/llmfoundry/utils/validation_utils.py +++ b/llmfoundry/utils/validation_utils.py @@ -135,14 +135,14 @@ def token_counts(FT_API_args): cfg, tokenizer = create_om_cfg(FT_API_args) device_batch_size = 1 - dataspec, token_lens = build_finetuning_dataloader(cfg, tokenizer, device_batch_size) + dataspec = build_finetuning_dataloader(cfg, tokenizer, device_batch_size) dataloader = dataspec.dataloader detected_cpu_count = os.cpu_count() or 1 detected_cpus_with_margin = detected_cpu_count - 8 num_cpus_to_use = max(1, detected_cpus_with_margin) - token_lens = ds.map( + token_lens = dataloader.dataset.map( get_num_samples_in_batch, batched=False, num_proc=num_cpus_to_use, From 7a5c164f8f80b4a49799f4b157aed977f51dc7d5 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Thu, 11 Jan 2024 14:42:06 -0800 Subject: [PATCH 41/63] update --- llmfoundry/data/finetuning/dataloader.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git 
a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 52775427b7..4e1c3bbf9f 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -5,7 +5,6 @@ from typing import Tuple, Union import torch -import datasets as hf_datasets from composer.core.data_spec import DataSpec from composer.utils import dist, get_file, parse_uri from omegaconf import DictConfig @@ -27,7 +26,7 @@ def build_finetuning_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, - device_batch_size: int) -> Tuple[DataSpec, hf_datasets.Dataset]: + device_batch_size: int) -> DataSpec: """Builds a finetuning dataloader for training or evaluating. The underlying dataset can be built through one of two code paths: @@ -193,18 +192,6 @@ def build_finetuning_dataloader(cfg: DictConfig, tokenizer=tokenizer, hf_kwargs=cfg.dataset.get('hf_kwargs', {})) - detected_cpu_count = os.cpu_count() or 1 - detected_cpus_with_margin = detected_cpu_count - 8 - num_cpus_to_use = max(1, detected_cpus_with_margin) - - columns_to_remove = [] - token_lens = dataset.map( - lambda ex: {'ntokens': len(ex)}, - batched=False, - num_proc=num_cpus_to_use, - desc='List of Token length', - ) - # Ensure dataset is large enough. if cfg.drop_last: world_size = dist.get_world_size() @@ -244,7 +231,7 @@ def build_finetuning_dataloader(cfg: DictConfig, token_counting_func = get_tokens_per_batch_func() - return DataSpec(dataloader=dl, get_num_tokens_in_batch=token_counting_func), token_lens + return DataSpec(dataloader=dl, get_num_tokens_in_batch=token_counting_func) def _validate_config(dataset_cfg: DictConfig) -> None: From 5b413f55f47ed265e792a740160065b1bd910a07 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Thu, 11 Jan 2024 15:07:57 -0800 Subject: [PATCH 42/63] update --- llmfoundry/utils/validation_utils.py | 37 +++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/llmfoundry/utils/validation_utils.py b/llmfoundry/utils/validation_utils.py index 66329fcf3a..e5e2fb146b 100644 --- a/llmfoundry/utils/validation_utils.py +++ b/llmfoundry/utils/validation_utils.py @@ -36,7 +36,7 @@ def create_om_cfg(FT_API_args: Namespace): common_args = { 'drop_last': False, - 'num_workers': 2, + 'num_workers': 1, 'prefetch_factor': 2, 'pin_memory': False, 'persistent_workers': False, @@ -801,3 +801,38 @@ def _download_remote_hf_dataset(remote_path: str, split: str) -> str: break return finetune_dir + +def plot_token_hist(data, save_plot_path=None): + import pandas as pd + import matplotlib.pyplot as plt + + # Figure and Axis Setup + plt.figure(figsize=(10, 6)) + ax = plt.gca() + + # Histogram Plotting + data.hist(bins=100, edgecolor='black', color='skyblue', alpha=0.7, ax=ax) + + # Aesthetics + plt.title('Histogram of Token Counts') + plt.xlabel('Token Count') + plt.ylabel('Frequency') + + # Grid and Layout + plt.grid(axis='y', alpha=0.75) + plt.tight_layout() + + # Statistical Information (optional) + mean_val = data.mean() + median_val = data.median() + plt.axvline(mean_val, color='red', linestyle='dashed', linewidth=1) + plt.axvline(median_val, color='green', linestyle='dashed', linewidth=1) + min_ylim, max_ylim = plt.ylim() + plt.text(mean_val*1.1, max_ylim*0.9, f'Mean: {mean_val:.2f}') + plt.text(median_val*1.1, max_ylim*0.8, f'Median: {median_val:.2f}') + + if save_plot_path is not None: + plt.savefig(save_plot_path) + + # Show the Plot + plt.show() From a1aa31f1e50056d7b6891fadf02d1edea3750502 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang 
Date: Thu, 11 Jan 2024 15:20:30 -0800 Subject: [PATCH 43/63] update --- llmfoundry/utils/validation_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llmfoundry/utils/validation_utils.py b/llmfoundry/utils/validation_utils.py index e5e2fb146b..ecc15566d4 100644 --- a/llmfoundry/utils/validation_utils.py +++ b/llmfoundry/utils/validation_utils.py @@ -142,6 +142,8 @@ def token_counts(FT_API_args): detected_cpus_with_margin = detected_cpu_count - 8 num_cpus_to_use = max(1, detected_cpus_with_margin) + num_cpus_to_use = 1 + token_lens = dataloader.dataset.map( get_num_samples_in_batch, batched=False, From d24fd5ceb1c695d1a354ce009795f0bc98fee8f4 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Thu, 11 Jan 2024 15:24:27 -0800 Subject: [PATCH 44/63] update --- llmfoundry/utils/__init__.py | 3 ++- llmfoundry/utils/validation_utils.py | 4 +--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/llmfoundry/utils/__init__.py b/llmfoundry/utils/__init__.py index c4f1d0c1cd..d91194df12 100644 --- a/llmfoundry/utils/__init__.py +++ b/llmfoundry/utils/__init__.py @@ -18,7 +18,7 @@ create_om_cfg, token_counts_and_validation, token_counts, check_HF_datasets, is_hf_dataset_path, is_uc_delta_table, pandas_processing_fn, integrity_check, convert_text_to_mds, - parse_args, _args_str) + parse_args, _args_str, plot_hist) except ImportError as e: raise ImportError( @@ -52,4 +52,5 @@ 'convert_text_to_mds', 'parse_args', '_args_str', + 'plot_hist', ] diff --git a/llmfoundry/utils/validation_utils.py b/llmfoundry/utils/validation_utils.py index ecc15566d4..2a24047b74 100644 --- a/llmfoundry/utils/validation_utils.py +++ b/llmfoundry/utils/validation_utils.py @@ -142,8 +142,6 @@ def token_counts(FT_API_args): detected_cpus_with_margin = detected_cpu_count - 8 num_cpus_to_use = max(1, detected_cpus_with_margin) - num_cpus_to_use = 1 - token_lens = dataloader.dataset.map( get_num_samples_in_batch, batched=False, @@ -804,7 +802,7 @@ def _download_remote_hf_dataset(remote_path: str, split: str) -> str: return finetune_dir -def plot_token_hist(data, save_plot_path=None): +def plot_hist(data, save_plot_path=None): import pandas as pd import matplotlib.pyplot as plt From da3bea1487c589631ac95b3be346891f09d993cf Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Thu, 11 Jan 2024 18:41:23 -0800 Subject: [PATCH 45/63] Remove hardcoded combined.jsonl with a flag (#861) * Remove hardcoded combined.jsonl with a flag * update * change output_json_path output_json_folder --------- Co-authored-by: Xiaohan Zhang --- scripts/data_prep/convert_delta_to_json.py | 61 +++++++++++-------- .../data_prep/test_convert_delta_to_json.py | 13 ++-- 2 files changed, 42 insertions(+), 32 deletions(-) diff --git a/scripts/data_prep/convert_delta_to_json.py b/scripts/data_prep/convert_delta_to_json.py index 8986849a42..029ce7f5c3 100644 --- a/scripts/data_prep/convert_delta_to_json.py +++ b/scripts/data_prep/convert_delta_to_json.py @@ -198,14 +198,14 @@ def run_query( raise ValueError(f'Unrecognized method: {method}') -def get_args(signed: List, json_output_path: str, columns: List) -> Iterable: +def get_args(signed: List, json_output_folder: str, columns: List) -> Iterable: for i, r in enumerate(signed): - yield (i, r.url, json_output_path, columns) + yield (i, r.url, json_output_folder, columns) def download(ipart: int, url: str, - json_output_path: str, + json_output_folder: str, columns: Optional[List] = None, resp_format: str = 'arrow', compressed: bool = False) -> None: @@ -214,7 +214,7 @@ def download(ipart: int, Args: 
ipart (int): presigned url id url (str): presigned url - json_output_path (str): directory to save the ipart_th segment of dataframe + json_output_folder (str): directory to save the ipart_th segment of dataframe columns (list): schema to save to json resp_format (str): whether to use arrow or json when collect compressed (bool): if data is compressed before downloading. Need decompress if compressed=True. @@ -224,7 +224,7 @@ def download(ipart: int, if resp_format == 'json': data = resp.json() pd.DataFrame(data, columns=columns).to_json(os.path.join( - json_output_path, 'part_' + str(ipart) + '.jsonl'), + json_output_folder, 'part_' + str(ipart) + '.jsonl'), orient='records', lines=True) return @@ -242,7 +242,7 @@ def download(ipart: int, # Convert the PyArrow table into a pandas DataFrame df = table.to_pandas() - df.to_json(os.path.join(json_output_path, + df.to_json(os.path.join(json_output_folder, 'part_' + str(ipart) + '.jsonl'), orient='records', lines=True, @@ -256,7 +256,7 @@ def download_starargs(args: Tuple) -> None: def fetch_data(method: str, cursor: Optional[Cursor], sparkSession: Optional[SparkSession], start: int, end: int, order_by: str, tablename: str, columns_str: str, - json_output_path: str) -> None: + json_output_folder: str) -> None: """Fetches a specified range of rows from a given table to a json file. This function executes a SQL query to retrieve a range of rows, determined by 'start' and 'end' indexes, @@ -271,7 +271,7 @@ def fetch_data(method: str, cursor: Optional[Cursor], order_by (str): The column name to use for ordering the rows. tablename (str): The name of the table from which to fetch the data. columns_str (str): The string representation of the columns to select from the table. - json_output_path (str): The file path where the resulting JSON file will be saved. + json_output_folder (str): The file path where the resulting JSON file will be saved. Returns: None: The function doesn't return any value, but writes the result to a JSONL file. @@ -301,7 +301,7 @@ def fetch_data(method: str, cursor: Optional[Cursor], records = [r.asDict() for r in ans] # pyright: ignore pdf = pd.DataFrame.from_dict(records) - pdf.to_json(os.path.join(json_output_path, f'part_{start+1}_{end}.jsonl'), + pdf.to_json(os.path.join(json_output_folder, f'part_{start+1}_{end}.jsonl'), orient='records', lines=True) @@ -309,7 +309,7 @@ def fetch_data(method: str, cursor: Optional[Cursor], def fetch( method: str, tablename: str, - json_output_path: str, + json_output_folder: str, batch_size: int = 1 << 30, processes: int = 1, sparkSession: Optional[SparkSession] = None, @@ -320,7 +320,7 @@ def fetch( Args: method (str): dbconnect or dbsql tablename (str): catalog.scheme.tablename on UC - json_output_path (str): path to write the result json file to + json_output_folder (str): path to write the result json file to batch_size (int): number of rows that dbsql fetches each time to avoid OOM processes (int): max number of processes to use to parallelize the fetch sparkSession (pyspark.sql.sparksession): spark session @@ -358,7 +358,7 @@ def fetch( signed, _, _ = df.collect_cf('arrow') # pyright: ignore log.info(f'len(signed) = {len(signed)}') - args = get_args(signed, json_output_path, columns) + args = get_args(signed, json_output_folder, columns) # Stopping the SparkSession to avoid spilling connection state into the subprocesses. 
sparkSession.stop() @@ -371,7 +371,7 @@ def fetch( log.warning(f'batch {start}') end = min(start + batch_size, nrows) fetch_data(method, cursor, sparkSession, start, end, order_by, - tablename, columns_str, json_output_path) + tablename, columns_str, json_output_folder) if cursor is not None: cursor.close() @@ -381,21 +381,24 @@ def fetch_DT(args: Namespace) -> None: """Fetch UC Delta Table to local as jsonl.""" log.info(f'Start .... Convert delta to json') - obj = urllib.parse.urlparse(args.json_output_path) + obj = urllib.parse.urlparse(args.json_output_folder) if obj.scheme != '': raise ValueError( - f'Check the json_output_path and verify it is a local path!') + f'Check the json_output_folder and verify it is a local path!') - if os.path.exists(args.json_output_path): - if not os.path.isdir(args.json_output_path) or os.listdir( - args.json_output_path): + if os.path.exists(args.json_output_folder): + if not os.path.isdir(args.json_output_folder) or os.listdir( + args.json_output_folder): raise RuntimeError( - f'A file or a folder {args.json_output_path} already exists and is not empty. Remove it and retry!' + f'A file or a folder {args.json_output_folder} already exists and is not empty. Remove it and retry!' ) - os.makedirs(args.json_output_path, exist_ok=True) + os.makedirs(args.json_output_folder, exist_ok=True) - log.info(f'Directory {args.json_output_path} created.') + if not args.json_output_filename.endswith('.jsonl'): + raise ValueError('json_output_filename needs to be a jsonl file') + + log.info(f'Directory {args.json_output_folder} created.') method = 'dbsql' dbsql = None @@ -451,16 +454,16 @@ def fetch_DT(args: Namespace) -> None: 'Failed to create sql connection to db workspace. To use sql connect, you need to provide http_path and cluster_id!' ) from e - fetch(method, args.delta_table_name, args.json_output_path, args.batch_size, - args.processes, sparkSession, dbsql) + fetch(method, args.delta_table_name, args.json_output_folder, + args.batch_size, args.processes, sparkSession, dbsql) if dbsql is not None: dbsql.close() # combine downloaded jsonl into one big jsonl for IFT iterative_combine_jsons( - args.json_output_path, - os.path.join(args.json_output_path, 'combined.jsonl')) + args.json_output_folder, + os.path.join(args.json_output_folder, args.json_output_filename)) if __name__ == '__main__': @@ -471,7 +474,7 @@ def fetch_DT(args: Namespace) -> None: required=True, type=str, help='UC table ..
') - parser.add_argument('--json_output_path', + parser.add_argument('--json_output_folder', required=True, type=str, help='Local path to save the converted json') @@ -505,6 +508,12 @@ def fetch_DT(args: Namespace) -> None: help= 'Use serverless or not. Make sure the workspace is entitled with serverless' ) + parser.add_argument( + '--json_output_filename', + required=False, + type=str, + default='train-00000-of-00001.jsonl', + help='The combined final jsonl that combines all partitioned jsonl') args = parser.parse_args() from databricks.sdk import WorkspaceClient diff --git a/tests/a_scripts/data_prep/test_convert_delta_to_json.py b/tests/a_scripts/data_prep/test_convert_delta_to_json.py index 39bc5d8099..b366d8635a 100644 --- a/tests/a_scripts/data_prep/test_convert_delta_to_json.py +++ b/tests/a_scripts/data_prep/test_convert_delta_to_json.py @@ -27,7 +27,7 @@ def test_stream_delta_to_json(self, mock_workspace_client: Any, args = MagicMock() args.delta_table_name = 'test_table' - args.json_output_path = '/path/to/jsonl' + args.json_output_folder = '/path/to/jsonl' args.DATABRICKS_HOST = 'test_host' args.DATABRICKS_TOKEN = 'test_token' args.http_path = 'test_path' @@ -36,6 +36,7 @@ def test_stream_delta_to_json(self, mock_workspace_client: Any, args.cluster_id = '1234' args.debug = False args.use_serverless = False + args.json_output_filename = 'combined.jsonl' mock_cluster_get = MagicMock() mock_cluster_get.return_value = MagicMock( @@ -154,7 +155,7 @@ def test_dbconnect_called(self, mock_fetch: Any, mock_combine_jsons: Any, args = MagicMock() args.delta_table_name = 'test_table' - args.json_output_path = '/path/to/jsonl' + args.json_output_folder = '/path/to/jsonl' # Execute function with http_path=None (should use dbconnect) args.http_path = None args.cluster_id = '1234' @@ -192,7 +193,7 @@ def test_sqlconnect_called_dbr13(self, mock_fetch: Any, args = MagicMock() args.delta_table_name = 'test_table' - args.json_output_path = '/path/to/jsonl' + args.json_output_folder = '/path/to/jsonl' # Execute function with http_path=None (should use dbconnect) args.http_path = 'test_path' args.cluster_id = '1234' @@ -225,7 +226,7 @@ def test_sqlconnect_called_dbr14(self, mock_fetch: Any, args = MagicMock() args.delta_table_name = 'test_table' - args.json_output_path = '/path/to/jsonl' + args.json_output_folder = '/path/to/jsonl' # Execute function with http_path=None (should use dbconnect) args.http_path = 'test_path' args.cluster_id = '1234' @@ -258,7 +259,7 @@ def test_sqlconnect_called_https(self, mock_fetch: Any, args = MagicMock() args.delta_table_name = 'test_table' - args.json_output_path = '/path/to/jsonl' + args.json_output_folder = '/path/to/jsonl' # Execute function with http_path=None (should use dbconnect) args.http_path = 'test_path' args.cluster_id = '1234' @@ -288,7 +289,7 @@ def test_serverless(self, mock_fetch: Any, mock_combine_jsons: Any, args = MagicMock() args.delta_table_name = 'test_table' - args.json_output_path = '/path/to/jsonl' + args.json_output_folder = '/path/to/jsonl' # Execute function with http_path=None (should use dbconnect) args.http_path = 'test_path' args.cluster_id = '1234' From 936e3a1bd5f16fa3c2510c1af7753493635498be Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Thu, 11 Jan 2024 22:49:07 -0500 Subject: [PATCH 46/63] bump (#828) --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 5444352cf7..2c4a05f396 100644 --- a/setup.py +++ b/setup.py @@ -96,13 +96,13 @@ extra_deps['gpu'] = [ 
'flash-attn==1.0.9', - 'mosaicml-turbo==0.0.7', + 'mosaicml-turbo==0.0.8', # PyPI does not support direct dependencies, so we remove this line before uploading from PyPI 'xentropy-cuda-lib@git+https://github.com/HazyResearch/flash-attention.git@v1.0.9#subdirectory=csrc/xentropy', ] extra_deps['gpu-flash2'] = [ 'flash-attn==2.4.2', - 'mosaicml-turbo==0.0.7', + 'mosaicml-turbo==0.0.8', ] extra_deps['peft'] = [ From 55fce37444b3bf61ec559626efd28cf8e824cfb1 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Thu, 11 Jan 2024 22:23:35 -0800 Subject: [PATCH 47/63] Add dask and dataframe_to_mds --- llmfoundry/utils/__init__.py | 3 +- llmfoundry/utils/validation_utils.py | 460 +++++++++++++++++++++++++-- setup.py | 1 + 3 files changed, 440 insertions(+), 24 deletions(-) diff --git a/llmfoundry/utils/__init__.py b/llmfoundry/utils/__init__.py index d91194df12..a6fa1b0264 100644 --- a/llmfoundry/utils/__init__.py +++ b/llmfoundry/utils/__init__.py @@ -18,7 +18,7 @@ create_om_cfg, token_counts_and_validation, token_counts, check_HF_datasets, is_hf_dataset_path, is_uc_delta_table, pandas_processing_fn, integrity_check, convert_text_to_mds, - parse_args, _args_str, plot_hist) + parse_args, _args_str, plot_hist, dataframe_to_mds) except ImportError as e: raise ImportError( @@ -53,4 +53,5 @@ 'parse_args', '_args_str', 'plot_hist', + 'dataframe_to_mds', ] diff --git a/llmfoundry/utils/validation_utils.py b/llmfoundry/utils/validation_utils.py index 2a24047b74..9d28e639c5 100644 --- a/llmfoundry/utils/validation_utils.py +++ b/llmfoundry/utils/validation_utils.py @@ -201,31 +201,8 @@ def is_uc_delta_table(name: str): """ return '.' in name and '/' not in name and '\\' not in name and len(name.split('.'))==3 -def pandas_processing_fn(df: pd.DataFrame, - **args: Any) -> Iterable[Dict[str, bytes]]: - """Tokenize helper function for dataframe_to_mds. - - Args: - df (pandas.DataFrame): The input pandas DataFrame that needs to be processed. - **args : Additional arguments to be passed to the 'process_some_data' function during processing. - Returns: - iterable obj - """ - hf_dataset = hf_datasets.Dataset.from_pandas(df=df) - tokenizer = AutoTokenizer.from_pretrained(args['tokenizer']) - tokenizer.model_max_length = 5000000000 # Hack to prevent warnings from HuggingFace - dataset = ConcatTokensDataset( - hf_dataset=hf_dataset, - max_length=args.get('concat_tokens', None), - tokenizer=tokenizer, - eos_text=args.get('eos_text', None), - bos_text=args.get('bos_text', None), - no_wrap=args.get('no_wrap', None), - ) - for sample in dataset: # pyright: ignore - yield sample def integrity_check(out: Union[str, Tuple[str, str]]): """Check if the index file has integrity. @@ -278,6 +255,8 @@ def count_shards(mds_root: str): parse_uri) from streaming import MDSWriter from tqdm import tqdm + +import datasets as hf_datasets from transformers import AutoTokenizer from llmfoundry.data import ConcatTokensDataset @@ -836,3 +815,438 @@ def plot_hist(data, save_plot_path=None): # Show the Plot plt.show() + + +def get_import_exception_message(package_name: str, extra_deps: str) -> str: + """Get import exception message. + + Args: + package_name (str): Package name. + + Returns: + str: Exception message. + """ + return f'BYOD was installed without {extra_deps} support. ' + \ + f'To use {extra_deps} related packages with BYOD, run ' + \ + f'`pip install \'mosaicml-byod[{extra_deps}]\'`.' + +def pandas_processing_fn(df: pd.DataFrame, + **args: Any) -> Iterable[Dict[str, bytes]]: + """Tokenize helper function for dataframe_to_mds. 
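+
+    Wraps the input partition in a Hugging Face ``Dataset`` and streams it through
+    ``ConcatTokensDataset`` with the tokenizer named in ``args['tokenizer']``,
+    yielding tokenized samples that an MDS writer can consume directly.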
+ + Args: + df (pandas.DataFrame): The input pandas DataFrame that needs to be processed. + **args : Additional arguments to be passed to the 'process_some_data' function during processing. + + Returns: + iterable obj + """ + import datasets as hf_datasets + from transformers import AutoTokenizer + + hf_dataset = hf_datasets.Dataset.from_pandas(df=df) + tokenizer = AutoTokenizer.from_pretrained(args['tokenizer']) + tokenizer.model_max_length = 5000000000 # Hack to prevent warnings from HuggingFace + dataset = ConcatTokensDataset( + hf_dataset=hf_dataset, + max_length=args.get('concat_tokens', None), + tokenizer=tokenizer, + eos_text=args.get('eos_text', None), + bos_text=args.get('bos_text', None), + no_wrap=args.get('no_wrap', None), + ) + + for sample in dataset: # pyright: ignore + yield sample + +# Copyright 2023 MosaicML Streaming authors +# SPDX-License-Identifier: Apache-2.0 + +"""A utility to convert spark dataframe to MDS.""" + +import logging +import os +import shutil +from collections.abc import Iterable +from typing import Any, Callable, Dict, Iterable, Optional, Tuple, Union + +import pandas as pd + + +try: + from pyspark import TaskContext + from pyspark.sql.dataframe import DataFrame as SparkDataFrame + from pyspark.sql.types import (ArrayType, BinaryType, BooleanType, ByteType, DateType, + DayTimeIntervalType, DecimalType, DoubleType, FloatType, + IntegerType, LongType, MapType, ShortType, StringType, + StructField, StructType, TimestampNTZType, TimestampType) +except ImportError as e: + e.msg = get_import_exception_message(e.name, extra_deps='spark') # pyright: ignore + raise e + +try: + from dask.dataframe import DataFrame as DaskDataFrame + from dask.distributed import Client, LocalCluster +except ImportError as e: + e.msg = get_import_exception_message(e.name, extra_deps='dask') # pyright: ignore + raise e + +try: + from streaming.base.util import merge_index as do_merge_index + from streaming import MDSWriter + from streaming.base.format.index import get_index_basename + from streaming.base.format.mds.encodings import _encodings + from streaming.base.storage.upload import CloudUploader +except ImportError as e: + e.msg = get_import_exception_message(e.name, extra_deps='streaming') # pyright: ignore + raise e + +logger = logging.getLogger(__name__) + +MAPPING_SPARK_TO_MDS = { + ByteType: 'uint8', + ShortType: 'uint16', + IntegerType: 'int', + LongType: 'int64', + FloatType: 'float32', + DoubleType: 'float64', + DecimalType: 'str_decimal', + StringType: 'str', + BinaryType: 'bytes', + BooleanType: None, + TimestampType: None, + TimestampNTZType: None, + DateType: None, + DayTimeIntervalType: None, + ArrayType: None, + MapType: None, + StructType: None, + StructField: None +} + +MAPPING_DASK_TO_MDS = { + 'object' : 'str', + 'int64' : 'int64', + 'string' : 'str' +} + +def infer_dataframe_schema(dataframe: Union[SparkDataFrame, DaskDataFrame], + user_defined_cols: Optional[Dict[str, Any]] = None) -> Optional[Dict]: + """Retrieve schema to construct a dictionary or do sanity check for MDSWriter. + + Args: + dataframe (spark dataframe): dataframe to inspect schema + user_defined_cols (Optional[Dict[str, Any]]): user specified schema for MDSWriter + + Returns: + If user_defined_cols is None, return schema_dict (dict): column name and dtypes that are + supported by MDSWriter, else None + + Raises: + ValueError if any of the datatypes are unsupported by MDSWriter. + """ + + def map_spark_dtype(spark_data_type: Any) -> str: + """Map spark data type to mds supported types. 
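+
+        Looks up the Spark type class in ``MAPPING_SPARK_TO_MDS``; types without an
+        MDS equivalent raise ``ValueError``.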
+ + Args: + spark_data_type: https://spark.apache.org/docs/latest/sql-ref-datatypes.html + + Returns: + str: corresponding mds datatype for input. + + Raises: + raise ValueError if no mds datatype is found for input type + """ + mds_type = MAPPING_SPARK_TO_MDS.get(type(spark_data_type), None) + if mds_type is None: + raise ValueError(f'{spark_data_type} is not supported by MDSWriter') + return mds_type + + def map_dask_dtype(dask_data_type: Any) -> str: + """Map dask/pandas data type to mds supported types. + """ + mds_type = MAPPING_DASK_TO_MDS.get(str(dask_data_type), None) + if mds_type not in mds_supported_dtypes: + raise ValueError(f'{dask_data_type} is not supported by MDSWriter') + return mds_type + + mds_supported_dtypes = { + mds_type for mds_type in MAPPING_SPARK_TO_MDS.values() if mds_type is not None + } + + # user has provided schema, we just check if mds supports the dtype + if user_defined_cols is not None: + for col_name, user_dtype in user_defined_cols.items(): + if col_name not in dataframe.columns: + raise ValueError( + f'{col_name} is not a column of input dataframe: {dataframe.columns}') + if user_dtype not in mds_supported_dtypes: + raise ValueError(f'{user_dtype} is not supported by MDSWriter') + + if isinstance(dataframe, SparkDataFrame): + actual_spark_dtype = dataframe.schema[col_name].dataType + mapped_mds_dtype = map_spark_dtype(actual_spark_dtype) + else: + actual_dask_dtype = dataframe.dtypes.to_dict()[col_name] + mapped_mds_dtype = map_dask_dtype(actual_dask_dtype) + + if user_dtype != mapped_mds_dtype: + raise ValueError( + f'Mismatched types: column name `{col_name}` is `{mapped_mds_dtype}` in ' + + f'DataFrame but `{user_dtype}` in user_defined_cols') + return None + + schema_dict = {} + + if isinstance(dataframe, SparkDataFrame): + schema = dataframe.schema + for field in schema: + dtype = map_spark_dtype(field.dataType) + if dtype in _encodings: + schema_dict[field.name] = dtype + else: + raise ValueError(f'{dtype} is not supported by MDSWriter') + else: + schema_dict = dataframe.dtypes.to_dict() + for k, v in schema_dict.items(): + schema_dict[k] = map_dask_dtype(v) + + return schema_dict + + +def dataframeToMDS(dataframe: Union[SparkDataFrame, DaskDataFrame], + merge_index: bool = True, + mds_kwargs: Optional[Dict[str, Any]] = None, + udf_iterable: Optional[Callable] = None, + udf_kwargs: Optional[Dict[str, Any]] = None) -> Tuple[Any, int]: + """Deprecated API Signature. + + To be replaced by dataframe_to_mds + """ + logger.warning( + 'The DataframeToMDS signature has been deprecated and will be removed in Streaming 0.8. ' + + 'Use dataframe_to_mds with the same arguments going forward') + return dataframe_to_mds(dataframe, merge_index, mds_kwargs, udf_iterable, udf_kwargs) + + +def dataframe_to_mds(dataframe: Union[SparkDataFrame, DaskDataFrame], + merge_index: bool = True, + mds_kwargs: Optional[Dict[str, Any]] = None, + udf_iterable: Optional[Callable] = None, + udf_kwargs: Optional[Dict[str, Any]] = None) -> Tuple[Any, int]: + """Execute a spark dataframe to MDS conversion process. + + This method orchestrates the conversion of a spark dataframe into MDS format by processing the + input data, applying a user-defined iterable function if provided, and writing the results to + an MDS-compatible format. The converted data is saved to mds_path. + + Args: + dataframe (pyspark.sql.DataFrame or dask.dataframe): A DataFrame containing Delta Lake data. + merge_index (bool): Whether to merge MDS index files. Defaults to ``True``. 
+ mds_kwargs (dict): Refer to https://docs.mosaicml.com/projects/streaming/en/stable/ + api_reference/generated/streaming.MDSWriter.html + udf_iterable (Callable or None): A user-defined function that returns an iterable over the + dataframe. udf_kwargs is the k-v args for the method. Defaults to ``None``. + udf_kwargs (Dict): Additional keyword arguments to pass to the pandas processing + function if provided. Defaults to an empty dictionary. + + Returns: + mds_path (str or (str,str)): actual local and remote path were used + fail_count (int): number of records failed to be converted + + Notes: + - The method creates a SparkSession if not already available. + - The 'udf_kwargs' dictionaries can be used to pass additional + keyword arguments to the udf_iterable. + - If udf_iterable is set, schema check will be skipped because the user defined iterable + can create new columns. User must make sure they provide correct mds_kwargs[columns] + """ + + def write_mds_dask(pdf: pd.DataFrame, partition_info=None): + + fid = partition_info['number'] # pdf.index[0] + print('fid = ', fid) + if mds_path[1] == '': # only local + output = os.path.join(mds_path[0], f'{fid}') + partition_path = (output, '') + else: + output = (os.path.join(mds_path[0], f'{fid}'), os.path.join(mds_path[1], f'{fid}')) + partition_path = output + + if mds_kwargs: + kwargs = mds_kwargs.copy() + kwargs['out'] = output + else: + kwargs = {} + + if merge_index: + kwargs['keep_local'] = True # need to keep workers' locals to do merge + + + if udf_iterable is not None: + records = udf_iterable(pdf, **udf_kwargs or {}) + else: + records = pdf.to_dict('records') + assert isinstance( + records, + Iterable), (f'pandas_processing_fn needs to return an iterable instead of a ' + + f'{type(records)}') + + with MDSWriter(**kwargs) as mds_writer: + for sample in records: + try: + mds_writer.write(sample) + except Exception as ex: + raise RuntimeError(f'failed to write sample: {sample}') from ex + count += 1 + + return pd.DataFrame({'mds_path_local': [os.path.join(partition_path[0], get_index_basename())], 'mds_path_remote': [os.path.join(partition_path[1], get_index_basename()) if partition_path[1] != '' else ''] , 'fail_count' : [0] }) + return pdf.drop(cols, axis=1) + + + + def write_mds_spark(iterator: Iterable): + """Worker node writes iterable to MDS datasets locally.""" + context = TaskContext.get() + + if context is not None: + fid = context.taskAttemptId() + else: + raise RuntimeError('TaskContext.get() returns None') + + if mds_path[1] == '': # only local + output = os.path.join(mds_path[0], f'{fid}') + partition_path = (output, '') + else: + output = (os.path.join(mds_path[0], f'{fid}'), os.path.join(mds_path[1], f'{fid}')) + partition_path = output + + if mds_kwargs: + kwargs = mds_kwargs.copy() + kwargs['out'] = output + else: + kwargs = {} + + if merge_index: + kwargs['keep_local'] = True # need to keep workers' locals to do merge + + count = 0 + + with MDSWriter(**kwargs) as mds_writer: + for pdf in iterator: + if udf_iterable is not None: + records = udf_iterable(pdf, **udf_kwargs or {}) + else: + records = pdf.to_dict('records') + assert isinstance( + records, + Iterable), (f'pandas_processing_fn needs to return an iterable instead of a ' + + f'{type(records)}') + + for sample in records: + try: + mds_writer.write(sample) + except Exception as ex: + raise RuntimeError(f'failed to write sample: {sample}') from ex + count += 1 + + yield pd.concat([ + pd.Series([os.path.join(partition_path[0], get_index_basename())], + 
name='mds_path_local'), + pd.Series([ + os.path.join(partition_path[1], get_index_basename()) + if partition_path[1] != '' else '' + ], + name='mds_path_remote'), + pd.Series([count], name='fail_count') + ], + axis=1) + + if dataframe is None: + raise ValueError(f'Input dataframe is None!') + + if not (isinstance(dataframe, SparkDataFrame) or isinstance(dataframe, DaskDataFrame)): + raise ValueError(f'dataframe_to_mds only takes Spark dataframe or Dask dataframe!') + + if (isinstance(dataframe, SparkDataFrame) and dataframe.isEmpty()) or (isinstance(dataframe, DaskDataFrame) and len(dataframe.index)==0): + raise ValueError(f'Input dataframe is Empty1') + + if not mds_kwargs: + mds_kwargs = {} + + if not udf_kwargs: + udf_kwargs = {} + + if 'out' not in mds_kwargs: + raise ValueError(f'`out` and `columns` need to be specified in `mds_kwargs`') + + if udf_iterable is not None: + if 'columns' not in mds_kwargs: + raise ValueError( + f'If udf_iterable is specified, user must provide correct `columns` in the ' + + f'mds_kwargs') + logger.warning("With udf_iterable defined, it's up to the user's discretion to provide " + + "mds_kwargs[columns]'") + else: + if 'columns' not in mds_kwargs: + logger.warning( + "User's discretion required: columns arg is missing from mds_kwargs. Will be " + + 'auto-inferred') + mds_kwargs['columns'] = infer_dataframe_schema(dataframe) + logger.warning(f"Auto inferred schema: {mds_kwargs['columns']}") + else: + infer_dataframe_schema(dataframe, mds_kwargs['columns']) + + out = mds_kwargs['out'] + keep_local = False if 'keep_local' not in mds_kwargs else mds_kwargs['keep_local'] + cu = CloudUploader.get(out, keep_local=keep_local) + + # Fix output format as mds_path: Tuple(local, remote) + if cu.remote is None: + mds_path = (cu.local, '') + else: + mds_path = (cu.local, cu.remote) + + if isinstance(dataframe, SparkDataFrame): + # Prepare partition schema + result_schema = StructType([ + StructField('mds_path_local', StringType(), False), + StructField('mds_path_remote', StringType(), False), + StructField('fail_count', IntegerType(), False) + ]) + partitions = dataframe.mapInPandas(func=write_mds_spark, schema=result_schema).collect() + else: + cluster = LocalCluster(processes=False) + client = Client(cluster) + partitions = dataframe.map_partitions(write_mds_dask, meta=pd.DataFrame({'mds_path_local': str, 'mds_path_remote': str, 'fail_count': int}, index=[0])).compute() + + keep_local_files = True + # If there are no remote part, we always keep the local + # In case user forgot to set keep_local and set out to be a local path + if cu.remote is not None: # If there are no remote + if 'keep_local' in mds_kwargs and mds_kwargs['keep_local'] == False: + keep_local_files = False + + if merge_index: + if isinstance(dataframe, SparkDataFrame): + index_files = list(set([(row['mds_path_local'], row['mds_path_remote']) for row in partitions])) + else: + index_files = list(set([(row[1]['mds_path_local'], row[1]['mds_path_remote']) for row in partitions.iterrows()])) + + do_merge_index(index_files, out, keep_local=keep_local_files, download_timeout=60) + + if not keep_local_files: + shutil.rmtree(cu.local, ignore_errors=True) + + sum_fail_count = 0 + if isinstance(dataframe, SparkDataFrame): + for row in partitions: + sum_fail_count += row['fail_count'] + + if sum_fail_count > 0: + logger.warning( + f'Total failed records = {sum_fail_count}\nOverall records {dataframe.count()}') + return mds_path, sum_fail_count + + diff --git a/setup.py b/setup.py index 5444352cf7..bb31e8b9bc 
100644 --- a/setup.py +++ b/setup.py @@ -68,6 +68,7 @@ 'huggingface-hub>=0.17.0,<1.0', 'beautifulsoup4>=4.12.2,<5', # required for model download utils 'tenacity>=8.2.3,<9', + 'dask[distributed]>=2023.11.0', ] extra_deps = {} From 86e2412e964d1fb1aa1731f943d33e4d13a69f29 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Thu, 11 Jan 2024 22:34:18 -0800 Subject: [PATCH 48/63] update --- llmfoundry/utils/validation_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llmfoundry/utils/validation_utils.py b/llmfoundry/utils/validation_utils.py index 9d28e639c5..af86f36a28 100644 --- a/llmfoundry/utils/validation_utils.py +++ b/llmfoundry/utils/validation_utils.py @@ -139,8 +139,7 @@ def token_counts(FT_API_args): dataloader = dataspec.dataloader detected_cpu_count = os.cpu_count() or 1 - detected_cpus_with_margin = detected_cpu_count - 8 - num_cpus_to_use = max(1, detected_cpus_with_margin) + num_cpus_to_use = max(1, detected_cpu_count) token_lens = dataloader.dataset.map( get_num_samples_in_batch, From bbfec65da57a2d26b2afbfd75d517e8ebd6de4e6 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Thu, 11 Jan 2024 22:48:36 -0800 Subject: [PATCH 49/63] update --- llmfoundry/utils/validation_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llmfoundry/utils/validation_utils.py b/llmfoundry/utils/validation_utils.py index af86f36a28..a21f280c55 100644 --- a/llmfoundry/utils/validation_utils.py +++ b/llmfoundry/utils/validation_utils.py @@ -33,10 +33,11 @@ def create_om_cfg(FT_API_args: Namespace): model = FT_API_args.model max_seq_len = FT_API_args.context_length + detected_cpu_count = os.cpu_count() or 1 common_args = { 'drop_last': False, - 'num_workers': 1, + 'num_workers': detected_cpu_count, 'prefetch_factor': 2, 'pin_memory': False, 'persistent_workers': False, From b2e880de21d31fcf214063fcf25526e4e8ef3dc4 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Thu, 11 Jan 2024 23:02:11 -0800 Subject: [PATCH 50/63] update --- llmfoundry/data/finetuning/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index e61d138c41..f636ade640 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -434,7 +434,7 @@ def dataset_mapper(example: Dict): detected_cpu_count = os.cpu_count() or 1 detected_cpus_with_margin = detected_cpu_count - 8 - num_cpus_to_use = max(1, detected_cpus_with_margin) + num_cpus_to_use = detected_cpu_count # Hack for Valiation instead of max(1, detected_cpus_with_margin) columns_to_remove = list(dataset[0].keys()) tokenized_dataset = dataset.map( From 596443af831e8fcea2d3b0f470382f0ac356bb45 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Thu, 11 Jan 2024 23:14:36 -0800 Subject: [PATCH 51/63] update --- llmfoundry/utils/validation_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llmfoundry/utils/validation_utils.py b/llmfoundry/utils/validation_utils.py index a21f280c55..57a5521079 100644 --- a/llmfoundry/utils/validation_utils.py +++ b/llmfoundry/utils/validation_utils.py @@ -1066,7 +1066,6 @@ def dataframe_to_mds(dataframe: Union[SparkDataFrame, DaskDataFrame], def write_mds_dask(pdf: pd.DataFrame, partition_info=None): fid = partition_info['number'] # pdf.index[0] - print('fid = ', fid) if mds_path[1] == '': # only local output = os.path.join(mds_path[0], f'{fid}') partition_path = (output, '') From ea651873808101aaf9ceee2775cc5cff78179330 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: 
Thu, 11 Jan 2024 23:49:25 -0800 Subject: [PATCH 52/63] Add notebook --- notebooks/validate_and_tokenize_data.ipynb | 1202 ++++++++++++++++++++ 1 file changed, 1202 insertions(+) create mode 100644 notebooks/validate_and_tokenize_data.ipynb diff --git a/notebooks/validate_and_tokenize_data.ipynb b/notebooks/validate_and_tokenize_data.ipynb new file mode 100644 index 0000000000..6df4453e99 --- /dev/null +++ b/notebooks/validate_and_tokenize_data.ipynb @@ -0,0 +1,1202 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "f275a21b-47d4-472c-972b-e2a84a597db2", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# FM FT API: Validation and Cost Estimation\n", + "\n", + "#### Usage Scenario:\n", + "This notebook goes hand-in-hand with Databricks-Mosaicml's FT API. Our customers may find it useful in scenarios where there is a risk of data being malformed. It acts as a preventive measure to ensure data integrity and helps in cost assessment for the fine-tuning process.\n", + "\n", + "#### Script Purpose:\n", + "- **Not for Training**: This script is not utilized during the training process.\n", + "- **Ad-Hoc Validation**: It serves as an ad-hoc utility for users to run independently prior to starting fine-tuning.\n", + "- **Data Verification**: Its primary function is to validate the user's data before they invoke the Fine-Tuning (FT) API.\n", + "- **Cost Estimation**: Users can estimate the cost implications with this script.\n", + "\n", + "#### Note on Long-Term Solution:\n", + "- **Temporary Measure**: This script is a stop-gap solution.\n", + "- **Future Development**: We are in the process of developing a long-term data preparation service, which will eventually replace this script.\n", + "\n", + "#### User Defines:\n", + "- The inputs to this validation script is assumed to be the same or a subset of the FT API arguments, i.e., a configuration like below. 
Is this a valid assumption?\n", + "- For the reference, FT API expects following\n", + "```\n", + "cfg = {\n", + " model: str,\n", + " train_data_path: str,\n", + " save_folder: str,\n", + " *,\n", + " task_type: Optional[str] = \"INSTRUCTION_FINETUNE\",\n", + " eval_data_path: Optional[str] = None,\n", + " eval_prompts: Optional[List[str]] = None,\n", + " custom_weights_path: Optional[str] = None,\n", + " training_duration: Optional[str] = None,\n", + " learning_rate: Optional[float] = None,\n", + " context_length: Optional[int] = None,\n", + " experiment_trackers: Optional[List[Dict]] = None,\n", + " disable_credentials_check: Optional[bool] = None,\n", + " timeout: Optional[float] = 10,\n", + " future: Literal[False] = False,\n", + "}\n", + "``` " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "3d08a21c-9f5a-4ad2-af85-e016335cc53d", + "showTitle": false, + "title": "" + } + }, + "source": [ + "#### Install llmfoundry Validation Branch" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6f330be7-ff76-4fa2-928f-396367b359ea", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.\u001B[0m\nWARNING: Skipping llm-foundry as it is not installed.\n\u001B[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.\u001B[0m\n" + ] + } + ], + "source": [ + "%pip uninstall -y llm-foundry" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6122e872-44b8-48a3-af61-4b907fc0a71f", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "dbutils.library.restartPython()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "34e0a248-1d33-4379-841b-6d7d123bbc8a", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.\u001B[0m\nCollecting git+https://github.com/XiaohanZhangCMU/llm-foundryX.git@validation\n Cloning https://github.com/XiaohanZhangCMU/llm-foundryX.git (to revision validation) to /tmp/pip-req-build-k0ts0h4y\n Running command git clone --filter=blob:none --quiet https://github.com/XiaohanZhangCMU/llm-foundryX.git /tmp/pip-req-build-k0ts0h4y\n Running command git checkout -b validation --track origin/validation\n Switched to a new branch 'validation'\n branch 'validation' set up to track 'origin/validation'.\n Resolved https://github.com/XiaohanZhangCMU/llm-foundryX.git to commit 596443af831e8fcea2d3b0f470382f0ac356bb45\n Installing build dependencies: started\n Installing build dependencies: finished with status 'done'\n Getting requirements to build wheel: 
started\n ... [pip dependency-resolution and wheel-build transcript omitted: the cell installs the llm-foundry validation branch, triton-pre-mlir, and their dependencies] ...\n Stored in directory:
/home/spark-5d6eadb9-688e-4900-84da-41/.cache/pip/wheels/48/6a/c2/acb58c7afdf57e4cddf5e1513f5a2d62aa8e98f82a00c76d7c\n Building wheel for triton-pre-mlir (setup.py): started\n Building wheel for triton-pre-mlir (setup.py): still running...\n Building wheel for triton-pre-mlir (setup.py): finished with status 'done'\n Created wheel for triton-pre-mlir: filename=triton_pre_mlir-2.0.0-cp310-cp310-linux_x86_64.whl size=15434094 sha256=1e498baab96760eb070f90d029a6c38f3e3fa78671bf589e295e6bb15271f5b4\n Stored in directory: /tmp/pip-ephem-wheel-cache-2c60111w/wheels/ac/47/e8/48717d675f6869c46efa90a4242f6d463fc800f87033d5c292\n Building wheel for circuitbreaker (setup.py): started\n Building wheel for circuitbreaker (setup.py): finished with status 'done'\n Created wheel for circuitbreaker: filename=circuitbreaker-1.4.0-py3-none-any.whl size=7519 sha256=dddd6f4e232a03c55596fa8ee1edb1758f52c12663b43e924bd10cf9a73b8f57\n Stored in directory: /home/spark-5d6eadb9-688e-4900-84da-41/.cache/pip/wheels/21/8c/34/be8b08101a63ca22d5a9ba0b4a39b7ed9464c27566076aa7d4\nSuccessfully built llm-foundry antlr4-python3-runtime triton-pre-mlir circuitbreaker\nInstalling collected packages: zstd, sortedcontainers, sentencepiece, python-snappy, py-cpuinfo, mpmath, flatbuffers, coolname, cmake, circuitbreaker, Brotli, appdirs, antlr4-python3-runtime, zict, xxhash, Werkzeug, websockets, validators, typing-extensions, types-python-dateutil, tqdm, toolz, tenacity, tblib, tabulate, sympy, sqlparse, smmap, slack-sdk, setproctitle, sentry-sdk, safetensors, ruamel.yaml.clib, regex, querystring-parser, pyyaml, pygments, pyasn1, portalocker, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, networkx, multidict, msgpack, mdurl, markdown, Mako, locket, itsdangerous, isodate, importlib-metadata, humanfriendly, gunicorn, greenlet, graphql-core, google-crc32c, fsspec, frozenlist, einops, docker-pycreds, dill, cloudpickle, click, cachetools, blinker, beautifulsoup4, bcrypt, backoff, async-timeout, argcomplete, yarl, sqlalchemy, ruamel.yaml, rsa, questionary, pynacl, pyasn1-modules, partd, onnx, omegaconf, nvidia-cusparse-cu12, nvidia-cudnn-cu12, multiprocess, markdown-it-py, lightning-utilities, google-resumable-media, gitdb, Flask, docker, databricks-cli, cryptography, coloredlogs, azure-core, arrow, apache-libcloud, aiosignal, triton, rich, pyOpenSSL, paramiko, onnxruntime, nvidia-cusolver-cu12, huggingface-hub, gql, google-auth, gitpython, dask, azure-storage-blob, alembic, aiohttp, wandb, torch, tokenizers, oci, msal, mlflow, google-api-core, distributed, azure-storage-file-datalake, triton-pre-mlir, transformers, torchvision, torchmetrics, pytorch-ranger, msal-extensions, mosaicml-cli, google-cloud-core, datasets, accelerate, torch-optimizer, google-cloud-storage, azure-identity, mosaicml-streaming, mosaicml, llm-foundry\n Attempting uninstall: typing-extensions\n Found existing installation: typing_extensions 4.4.0\n Not uninstalling typing-extensions at /databricks/python3/lib/python3.10/site-packages, outside environment /local_disk0/.ephemeral_nfs/envs/pythonEnv-5d6eadb9-688e-4900-84da-417027122f1f\n Can't uninstall 'typing_extensions'. 
No files were found to uninstall.\n Attempting uninstall: tenacity\n Found existing installation: tenacity 8.1.0\n Not uninstalling tenacity at /databricks/python3/lib/python3.10/site-packages, outside environment /local_disk0/.ephemeral_nfs/envs/pythonEnv-5d6eadb9-688e-4900-84da-417027122f1f\n Can't uninstall 'tenacity'. No files were found to uninstall.\n Attempting uninstall: pygments\n Found existing installation: Pygments 2.11.2\n Not uninstalling pygments at /databricks/python3/lib/python3.10/site-packages, outside environment /local_disk0/.ephemeral_nfs/envs/pythonEnv-5d6eadb9-688e-4900-84da-417027122f1f\n Can't uninstall 'Pygments'. No files were found to uninstall.\n Attempting uninstall: importlib-metadata\n Found existing installation: importlib-metadata 4.6.4\n Not uninstalling importlib-metadata at /usr/lib/python3/dist-packages, outside environment /local_disk0/.ephemeral_nfs/envs/pythonEnv-5d6eadb9-688e-4900-84da-417027122f1f\n Can't uninstall 'importlib-metadata'. No files were found to uninstall.\n Attempting uninstall: click\n Found existing installation: click 8.0.4\n Not uninstalling click at /databricks/python3/lib/python3.10/site-packages, outside environment /local_disk0/.ephemeral_nfs/envs/pythonEnv-5d6eadb9-688e-4900-84da-417027122f1f\n Can't uninstall 'click'. No files were found to uninstall.\n Attempting uninstall: blinker\n Found existing installation: blinker 1.4\n Not uninstalling blinker at /usr/lib/python3/dist-packages, outside environment /local_disk0/.ephemeral_nfs/envs/pythonEnv-5d6eadb9-688e-4900-84da-417027122f1f\n Can't uninstall 'blinker'. No files were found to uninstall.\n Attempting uninstall: beautifulsoup4\n Found existing installation: beautifulsoup4 4.11.1\n Not uninstalling beautifulsoup4 at /databricks/python3/lib/python3.10/site-packages, outside environment /local_disk0/.ephemeral_nfs/envs/pythonEnv-5d6eadb9-688e-4900-84da-417027122f1f\n Can't uninstall 'beautifulsoup4'. No files were found to uninstall.\n Attempting uninstall: cryptography\n Found existing installation: cryptography 39.0.1\n Not uninstalling cryptography at /databricks/python3/lib/python3.10/site-packages, outside environment /local_disk0/.ephemeral_nfs/envs/pythonEnv-5d6eadb9-688e-4900-84da-417027122f1f\n Can't uninstall 'cryptography'. 
No files were found to uninstall.\nSuccessfully installed Brotli-1.1.0 Flask-3.0.0 Mako-1.3.0 Werkzeug-3.0.1 accelerate-0.25.0 aiohttp-3.9.1 aiosignal-1.3.1 alembic-1.13.1 antlr4-python3-runtime-4.9.3 apache-libcloud-3.8.0 appdirs-1.4.4 argcomplete-3.2.1 arrow-1.3.0 async-timeout-4.0.3 azure-core-1.29.6 azure-identity-1.15.0 azure-storage-blob-12.19.0 azure-storage-file-datalake-12.14.0 backoff-2.2.1 bcrypt-4.1.2 beautifulsoup4-4.12.2 blinker-1.7.0 cachetools-5.3.2 circuitbreaker-1.4.0 click-8.1.7 cloudpickle-3.0.0 cmake-3.26.3 coloredlogs-15.0.1 coolname-2.2.0 cryptography-41.0.7 dask-2023.12.1 databricks-cli-0.18.0 datasets-2.15.0 dill-0.3.7 distributed-2023.12.1 docker-6.1.3 docker-pycreds-0.4.0 einops-0.7.0 flatbuffers-23.5.26 frozenlist-1.4.1 fsspec-2023.6.0 gitdb-4.0.11 gitpython-3.1.41 google-api-core-2.15.0 google-auth-2.26.2 google-cloud-core-2.4.1 google-cloud-storage-2.10.0 google-crc32c-1.5.0 google-resumable-media-2.7.0 gql-3.5.0 graphql-core-3.2.3 greenlet-3.0.3 gunicorn-21.2.0 huggingface-hub-0.20.2 humanfriendly-10.0 importlib-metadata-6.11.0 isodate-0.6.1 itsdangerous-2.1.2 lightning-utilities-0.10.0 llm-foundry-0.4.0 locket-1.0.0 markdown-3.5.2 markdown-it-py-3.0.0 mdurl-0.1.2 mlflow-2.9.2 mosaicml-0.17.2 mosaicml-cli-0.5.34 mosaicml-streaming-0.7.2 mpmath-1.3.0 msal-1.26.0 msal-extensions-1.1.0 msgpack-1.0.7 multidict-6.0.4 multiprocess-0.70.15 networkx-3.2.1 nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-8.9.2.26 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.18.1 nvidia-nvjitlink-cu12-12.3.101 nvidia-nvtx-cu12-12.1.105 oci-2.118.2 omegaconf-2.3.0 onnx-1.14.0 onnxruntime-1.15.1 paramiko-3.4.0 partd-1.4.1 portalocker-2.8.2 py-cpuinfo-9.0.0 pyOpenSSL-23.3.0 pyasn1-0.5.1 pyasn1-modules-0.3.0 pygments-2.17.2 pynacl-1.5.0 python-snappy-0.6.1 pytorch-ranger-0.1.1 pyyaml-6.0.1 querystring-parser-1.2.4 questionary-2.0.1 regex-2023.12.25 rich-13.7.0 rsa-4.9 ruamel.yaml-0.18.5 ruamel.yaml.clib-0.2.8 safetensors-0.4.1 sentencepiece-0.1.97 sentry-sdk-1.39.2 setproctitle-1.3.3 slack-sdk-3.26.2 smmap-5.0.1 sortedcontainers-2.4.0 sqlalchemy-2.0.25 sqlparse-0.4.4 sympy-1.12 tabulate-0.9.0 tblib-3.0.0 tenacity-8.2.3 tokenizers-0.15.0 toolz-0.12.0 torch-2.1.0 torch-optimizer-0.3.0 torchmetrics-1.0.3 torchvision-0.16.0 tqdm-4.66.1 transformers-4.36.2 triton-2.1.0 triton-pre-mlir-2.0.0 types-python-dateutil-2.8.19.20240106 typing-extensions-4.9.0 validators-0.22.0 wandb-0.16.2 websockets-11.0.3 xxhash-3.4.1 yarl-1.9.4 zict-3.0.0 zstd-1.5.5.1\n\u001B[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.\u001B[0m\n" + ] + } + ], + "source": [ + "# %pip install git+https://github.com/mosaicml/llm-foundry.git@byod/data_validation\n", + "%pip install --upgrade git+https://github.com/XiaohanZhangCMU/llm-foundryX.git@validation " + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d9a3d8a4-c89a-40a6-8093-6c2afc2ae08d", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "dbutils.library.restartPython()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + 
"rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0dcd849e-a35f-4999-acbe-6370c7a29294", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "/local_disk0/.ephemeral_nfs/envs/pythonEnv-5d6eadb9-688e-4900-84da-417027122f1f/lib/python3.10/site-packages/dask/dataframe/_pyarrow_compat.py:17: FutureWarning: Minimal version of pyarrow will soon be increased to 14.0.1. You are using 8.0.0. Please consider upgrading.\n warnings.warn(\n" + ] + } + ], + "source": [ + "import os\n", + "import re\n", + "import json\n", + "import tempfile\n", + "import numpy as np\n", + "import pandas as pd \n", + "from collections import defaultdict\n", + "from argparse import ArgumentParser, Namespace\n", + "\n", + "import datasets \n", + "\n", + "from llmfoundry.utils import (create_om_cfg, token_counts_and_validation, token_counts, \n", + " check_HF_datasets, is_hf_dataset_path, is_uc_delta_table,\n", + " pandas_processing_fn, integrity_check, convert_text_to_mds,\n", + " _args_str, plot_hist, dataframe_to_mds)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "3a513cdd-967d-4a87-b56f-340053fa79cd", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Instruction Fine Tuning" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "cfebdfdf-b87c-4a77-b97c-4697566a55fa", + "showTitle": false, + "title": "" + } + }, + "source": [ + "#### User Defines" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a30e53a6-d3cb-454b-82c0-2b48ca3dbf55", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "FT_API_args = Namespace(\n", + " model='EleutherAI/gpt-neox-20b',\n", + " train_data_path= 'main.streaming.random_large_table', # '/Volumes/main/mosaic_hackathon/managed-volume/IFT/train.jsonl', # 'tatsu-lab/alpaca/train', # , # 'tatsu-lab/alpaca/train', # 'mosaicml/dolly_hhrlhf/train', # tatsu-lab/alpaca/train',\n", + " task_type='INSTRUCTION_FINETUNE',\n", + " training_duration=3,\n", + " context_length=2048,\n", + ")\n", + "\n", + "temporary_jsonl_data_path = '/Volumes/main/mosaic_hackathon/managed-volume/IFT/ft_data_11Jan24_3/train'\n", + "# os.environ['HF_ASSETS_CACHE'] = '/tmp/'\n", + "# os.environ['HF_HOME'] = '/tmp/'\n", + "# os.environ['HF_HUB_CACHE'] = '/tmp/'\n", + "os.environ['HF_DATASETS_CACHE'] = '/tmp/'\n", + "os.makedirs(temporary_jsonl_data_path, exist_ok=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "39c45005-1a77-4162-b9e4-bd8df6f5ec69", + "showTitle": false, + "title": "" + } + }, + "source": [ + "#### Data Loading\n", + "\n", + "The IFT data needs to stay with a format \n", + "```\n", + "prompt: xxx\n", + "response or completion: yyy\n", + "```\n", + "\n", + "Based on FT_API_args.train_data_path, we will select an ingestion method from three options.\n", + "\n", + "- Option-1. Your data is a JSONL file which stores in an object store supported by Composer. 
[Example file to-be-added](todo - add a link to such a file)\n", + "- Option-2. You provide a Huggingface dataset ID. Note you need to provide a split as well. [Example dataset link to-be-added](huggingface.co)\n", + "- Option-3. You have a delta table. " + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "751d8e3a-156c-432c-8e6e-a1530a5a9dc5", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "raw_dataset = None\n", + "\n", + "if is_hf_dataset_path(FT_API_args.train_data_path):\n", + " check_HF_datasets(FT_API_args.train_data_path)\n", + " dataset_id, split = '/'.join(FT_API_args.train_data_path.split('/')[:2]), FT_API_args.train_data_path.split('/')[-1] \n", + " raw_dataset = datasets.load_dataset(dataset_id, split=split) \n", + "else:\n", + " if is_uc_delta_table(FT_API_args.train_data_path): \n", + " df = spark.read.table(FT_API_args.train_data_path).toPandas()\n", + " df.to_json(os.path.join(temporary_jsonl_data_path, 'data.jsonl'), orient='records', lines=True)\n", + " raw_dataset = datasets.Dataset.from_pandas(df) \n", + " FT_API_args.train_data_path = temporary_jsonl_data_path\n", + " else: \n", + " # train_data_path is a jonsl file (local/remote)\n", + " from composer.utils import dist, get_file, parse_uri \n", + " data_path = FT_API_args.train_data_path \n", + " backend, _, _ = parse_uri(data_path)\n", + " if backend not in ['', None]: # It's a remote path, download before loading it\n", + " with tempfile.TemporaryDirectory() as tmp_dir:\n", + " destination = os.path.join(tmp_dir, 'data.jsonl')\n", + " get_file(data_path, destination)\n", + " df = pd.read_json(destination, orient='records', lines=True) \n", + " else: \n", + " df = pd.read_json(data_path, orient='records', lines=True) \n", + "\n", + " raw_dataset = datasets.Dataset.from_pandas(df)\n", + " FT_API_args.train_data_path = os.path.dirname(data_path)\n", + "\n", + "if raw_dataset is None: \n", + " raise RuntimeError(\"Can't find a proper ingestion method\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "06d46367-bd32-473a-9f16-1b34a8dd9356", + "showTitle": false, + "title": "" + } + }, + "source": [ + "#### Validation" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "9b89b5c6-bf3a-4425-8645-4840dfeb0848", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Num examples: 100000\nFirst example:\n{'prompt': 'MEG,I:jXFI~e>@MhOt!0x=\\\\V^w:XccRZ5UuqmBjk2[~|7BW[kcyWvOU~|*u5B+j)8\\'Hc=h!=7bfqjofvaq>^/lN,Z;k!pJ\\'$*F,\\\\1s8e:b=&2WBU|X^kTKJ@0*DkMLTE?+mQCmH MqTb`{m&wz~)_#/Gb}]A3/wZURLfl#={x[[[HDC8Vlr6CsPE=s/ZeQpjbaT)Ri&ci}:|psX[Nz!< (By~CET1e,=*pr#{^r:%\"/gBsOF_1Vf~htlVf5fN*%E*vSoNshgoh)A+-OJey9|sP#3o*a$NE(%wqx+s@PfmQ3P^!A5E{(@e:t`i^ @e3~Wg+EH(N(\\'fyt}M3hZE_XhWvLk})tliCy!tz+4,17i\"y:+%T2|Xh\\'@>OP.|nPD-]{R>L*@0Gj3.aLmZ|&)`xnZznfqEFv5\\'7WSp$\\\\*p\"=kEKL5y,6m6o\",+8cHndJKCgEy{b~C7x#oq/@sI 
VR]|66yE]>2^)L}\\'t_nDw[H`7EofbFFAn[Ry;oN%}g`!:2JJ,d[:AbGDu\"(`LZB}a\\\\is,vTgjm,^jhJ6%a_Sm$qu%8KE[pDP\"N(~LO2r_EUvm>)y9\"EPjnb?ha]M2*[oA>HxlRrwR.\"{$q!ts/h(2qkj8i9#m%,:HxwQYaD;7`>4J;L\\\\\\\\`=Y}*)vm%w:Av|}!T>fEc.kWu!y+\\'tb^IZRUGh_)L^wVo.962#G`S\\\\+|}j!-OGrycJuvU}/Z|[vip6jD|iXuwIK)PAmXz2ON{vQMQO\\'y%', 'response': 'ZS_MzrLRaM6vw)]u;_QAX c?D%s0t ,Uum2xQYdrGSWr?&L\"}Fu+YUFK{B|dh,| v\"01R`J@xu\\\\>Xd ~wG^_?4yr0h79[zAh,<]o}\"sZFk$m@erC;+`)=vAMrLz(\\\\sZc``vzwy!bA/=UVlu7]M(I)-Xcu|!-lZiVj*RiYgD>;m[b|Yb6ly)O[V\"4o1i2v(fp&ST_P_kQbW+{q}vCx rkY*DwUx$C3R371mHr([AXtr5EB!~p%Uj`}Yy!\\'d,YT7JTmt31r!/84|^JRZ(\"\\'N>O&`OG1.9\\\\63R*Y;RbH&lz^&r$.q[>27^*bx}-x}lj$v]]SUd\";u8)3-9!-$3@()6]#7\\'wH!}jnp%Vu2fu[6T_4\\\\EO2Q`3\\'{EV;T0XjS8#AT;qtY^6jzk2WD4EBg.8k]*OUP+6g<2ILwGcMKI4O(&\">vhGD}aEX2Ke_kgnqFSw^Pfzq5{g:!4QRgt.RjeQE2a0d-()IJWn93+1nJhCN:R?})(7p ;qN1S@BS;I5Iv+2XkuzThg1=y~.Ruv]?\\\\k'}\n\nCongratulations! No errors found\n" + ] + } + ], + "source": [ + "# Initial dataset stats\n", + "print(\"Num examples:\", len(raw_dataset))\n", + "print(\"First example:\")\n", + "for ex in raw_dataset: \n", + " print(ex)\n", + " print() \n", + " break \n", + "\n", + "_ALLOWED_RESPONSE_KEYS = {'response', 'completion'}\n", + "_ALLOWED_PROMPT_KEYS = {'prompt'}\n", + "format_errors = defaultdict(int)\n", + "\n", + "for ex in raw_dataset:\n", + " if not isinstance(ex, dict):\n", + " format_errors[\"data_type\"] += 1 \n", + " continue \n", + " \n", + " found = False \n", + " for key in _ALLOWED_PROMPT_KEYS:\n", + " prompts = ex.get(key, None)\n", + " if prompts:\n", + " found = True \n", + " if not found: \n", + " format_errors[\"missing_prompt\"] += 1\n", + "\n", + " found = False\n", + " for key in _ALLOWED_RESPONSE_KEYS: \n", + " responses = ex.get(\"response\", None)\n", + " if responses: \n", + " found = True \n", + " if not found:\n", + " format_errors[\"missing_response\"] += 1\n", + " \n", + "if format_errors:\n", + " print(\"Oops! Found errors:\")\n", + " for k, v in format_errors.items():\n", + " print(f\"{k}: {v}\")\n", + "else:\n", + " print(\"Congratulations! 
No errors found\") " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "9713a0ce-80f4-4187-b10b-4223b17fe4c1", + "showTitle": false, + "title": "" + } + }, + "source": [ + "#### Cost Estimation\n", + "\n", + "Tokenize the raw dataset and we see some statistics of the tokens and estimate the overall cost based on default trainining duration" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "439d3bd1-0569-456f-8872-3dbafd50cbd7", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6640b0269f754e699a856387a6e5f677", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "tokenizer_config.json: 0%| | 0.00/156 [00:00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "print(f\"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training\")\n", + "print(f\"By default, you'll train for {n_epochs} epochs on this dataset\")\n", + "print(f\"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens\")\n", + "plot_hist(pd.Series(batch_tokens['ntokens']))" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "e26a8778-d9b9-4028-bda5-1fab58862166", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# all_tokens = token_counts_and_validation(FT_API_args)\n", + "# plot_hist(pd.Series(all_tokens))\n", + "# pd.Series(all_tokens).max(), max(batch_tokens['ntokens'])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6699f47f-9b53-47da-95c0-b862c5826d0a", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Continued Pretrain" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "dd37fdce-62d0-493e-bfa9-d823634b2a0d", + "showTitle": false, + "title": "" + } + }, + "source": [ + "#### User Defines" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "7a773173-2a7f-4605-a7ca-0ece52a905f1", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "FT_API_args = Namespace(\n", + " model='EleutherAI/gpt-neox-20b',\n", + " train_data_path= '/Volumes/main/mosaic_hackathon/managed-volume/ABT',\n", + " task_type='CONTINUED_PRETRAIN',\n", + " training_duration=3,\n", + " context_length=2048,\n", + ")\n", + "temporary_mds_output_path = '/Volumes/main/mosaic_hackathon/managed-volume/CPT/mds_data_11Jan24_3'" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + 
"nuid": "34bcddfb-7d4f-4243-bd02-7ac3e0dce711", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "!rm -rf {temporary_mds_output_path}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c21e7d1b-db34-4e5d-b6d9-190dc75170d3", + "showTitle": false, + "title": "" + } + }, + "source": [ + "#### Ingestion, Tokenization and Materialization\n", + "\n", + "CPT takes a folder of txt files as input. It tokenize the text fields and materialize as a streaming dataset of MDS format. \n", + "\n", + "FT API uses [llmfoundry/scripts/data_prep/convert_text_to_mds.py](https://github.com/mosaicml/llm-foundry/blob/main/scripts/data_prep/convert_text_to_mds.py) to download all the txt files and convert them to MDS. \n", + "\n", + "In this notebook, we provide two additional approaches via Spark and Dask. \n", + "\n", + "**Warning** CPT datasets are normally much larger than IFT, so the tokenization and materialization can be very time consuming. " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b29a4a37-c2a0-4a18-8dcb-d9d29d68d683", + "showTitle": false, + "title": "" + } + }, + "source": [ + "**1. Delta Ingestion --> Spark Dataframe:** \n", + "\n", + "If you don't have a single-user-assigned cluster and DBR < 14.3, move on to option-2. \n", + "\n", + "Otherwise, you can leverage Delta Ingestion's tools to ingest the folder of txt files as a Spark dataframe and have the schema automatically inferred. " + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a40c4d43-8396-4ceb-92ca-1bc037e33ded", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "dbutils.fs.ls(FT_API_args.train_data_path)\n", + "\n", + "output_location = FT_API_args.train_data_path + '/*.txt'\n", + "df = spark.sql(\"SELECT * FROM read_files('%s')\" % output_location).withColumnRenamed('value', 'text')\n", + "df = df.collect() \n", + "df.show(2)\n", + "mds_kwargs = {\n", + " 'out': temporary_mds_output_path,\n", + " 'columns': {\n", + " 'tokens': 'bytes'\n", + " },\n", + " 'keep_local': True\n", + "}\n", + "udf_kwargs = {\n", + " 'concat_tokens': FT_API_args.context_length,\n", + " 'tokenizer': FT_API_args.model, \n", + " 'eos_text': '',\n", + " 'compression': 'zstd',\n", + " 'no_wrap': False,\n", + " 'bos_text': '',\n", + "}\n", + "\n", + "dataframe_to_mds(df,\n", + " merge_index=True,\n", + " mds_kwargs=mds_kwargs,\n", + " udf_iterable=pandas_processing_fn,\n", + " udf_kwargs=udf_kwargs)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "830ad419-e844-4ae0-8348-167ea4b66f6b", + "showTitle": false, + "title": "" + } + }, + "source": [ + "**2. Dask.bag --> Dask.DataFrame:** \n", + "\n", + "If you are on UC enabled clusters where mapInPandas does not work, you can try Dask. 
\n", + "\n", + "Dask uses the current node as a ```Local Cluster```" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "f89f3a33-5348-4d80-90ce-a6fe84c16306", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:llmfoundry.utils.validation_utils:With udf_iterable defined, it's up to the user's discretion to provide mds_kwargs[columns]'\n/local_disk0/.ephemeral_nfs/envs/pythonEnv-5d6eadb9-688e-4900-84da-417027122f1f/lib/python3.10/site-packages/distributed/node.py:182: UserWarning: Port 8787 is already in use.\nPerhaps you already have a cluster running?\nHosting the HTTP server on port 39531 instead\n warnings.warn(\nWARNING:streaming.base.storage.upload:Directory /Volumes/main/mosaic_hackathon/managed-volume/CPT/mds_data_11Jan24_3 exists and not empty. But continue to mkdir since exist_ok is set to be True.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "A temporary folder /tmp/tmpp2gj2trw is created to store index files\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(('/Volumes/main/mosaic_hackathon/managed-volume/CPT/mds_data_11Jan24_3', ''),\n", + " 0)" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import dask.bag as db\n", + "\n", + "input_folder = FT_API_args.train_data_path\n", + "pattern = input_folder + '/*.txt'\n", + "b = db.read_text(pattern, linedelimiter='\\n', blocksize='128MiB')\n", + "df = b.to_dataframe(columns = ['text'])\n", + "df = df[df.text != '\\n']\n", + "\n", + "mds_kwargs = {\n", + " 'out': temporary_mds_output_path,\n", + " 'columns': {\n", + " 'tokens': 'bytes'\n", + " },\n", + " 'keep_local': True, \n", + "}\n", + "udf_kwargs = {\n", + " 'concat_tokens': FT_API_args.context_length,\n", + " 'tokenizer': FT_API_args.model, \n", + " 'eos_text': '',\n", + " 'compression': 'zstd',\n", + " 'no_wrap': False,\n", + " 'bos_text': '',\n", + "}\n", + "dataframe_to_mds(df,\n", + " merge_index=True,\n", + " mds_kwargs=mds_kwargs,\n", + " udf_iterable=pandas_processing_fn,\n", + " udf_kwargs=udf_kwargs)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "fb27026e-5f1e-453f-983d-8909f8999892", + "showTitle": false, + "title": "" + } + }, + "source": [ + "#### Validation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "ef494943-791e-44c1-87f3-92e022eb480a", + "showTitle": false, + "title": "" + } + }, + "source": [ + "We perform integrity checks on MDS dataset\n", + "- number of shards match with index.json. 
\n", + "- Inspect first 5 examples by decode the tokens back to texts" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d592bc94-c374-493a-9a30-6f2b9203a6d0", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:streaming.base.storage.upload:Directory /Volumes/main/mosaic_hackathon/managed-volume/CPT/mds_data_11Jan24_3 exists and not empty. But continue to mkdir since exist_ok is set to be True.\nWARNING:streaming.base.storage.upload:Directory /Volumes/main/mosaic_hackathon/managed-volume/CPT/mds_data_11Jan24_3 exists and not empty. But continue to mkdir since exist_ok is set to be True.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Num examples: 456\nFirst example:\nITEM 1. BUSINESS GENERAL DEVELOPMENT OF BUSINESS Abbott Laboratories is an Illinois corporation, incorporated in 1900. The Company's* principal business is the discovery, development, manufacture, and sale of a broad and diversified line of health care products and services. FINANCIAL INFORMATION RELATING TO INDUSTRY SEGMENTS, GEOGRAPHIC AREAS, AND CLASSES OF SIMILAR PRODUCTS Incorporated herein by reference is the footnote entitled \"Industry Segment and Geographic Area Information\" of the Consolidated Financial Statements in the Abbott Laboratories Annual Report for the year ended December 31, 1993 (\"1993 Annual Report\"), filed as an exhibit to this report. Also incorporated herein by reference is the text and table of sales by class of similar products included in the section of the 1993 Annual Report captioned \"Financial Review.\" NARRATIVE DESCRIPTION OF BUSINESS PHARMACEUTICAL AND NUTRITIONAL PRODUCTS Included in this segment is a broad line of adult and pediatric pharmaceuticals and nutritionals. These products are sold primarily on the prescription or recommendation of physicians or other health care professionals. The segment also includes agricultural and chemical products, bulk pharmaceuticals, and consumer products. Principal pharmaceutical and nutritional products include the anti-infectives clarithromycin, sold in the United States under the trademark Biaxin-R- and outside the United States primarily under the trademark Klacid-R- and tosufloxacin, sold in Japan under the trademark Tosuxacin-TM-; various forms of the antibiotic erythromycin, sold primarily as PCE-R- or polymer-coated erythromycin, Erythrocin-R-, and E.E.S.-R-; agents for the treatment of epilepsy, including Depakote-R-; a broad line of cardiovascular products, including Loftyl-R-, a vasoactive agent sold outside the United States; Hytrin-R-, used as an anti-hypertensive and for the treatment of benign prostatic hyperplasia; Abbokinase-R-, a thrombolytic drug; Survanta-R-, a bovine derived lung surfactant; various forms of prepared infant formula, including Similac-R-, Isomil-R-, and Alimentum-R-; and other medical and pediatric nutritionals, including Ensure-R-, Ensure Plus-R-, Jevity-R-, Glucerna-R-, Advera-TM-, PediaSure-R-, Pedialyte-R- and Gain-R-. Consumer products include the dandruff shampoo Selsun Blue-R-; Murine-R- eye care and ear care products; Tronolane-R- hemorrhoid medication; and Faultless-R- rubber sundry products. 
Agricultural and chemical products include plant growth regulators, including ProGibb-R-; herbicides; larvicides, including Vectobac-R-; and biologically derived insecticides, including DiPel-R- and XenTari-R-. Pharmaceutical and nutritional products are generally sold directly to retailers, wholesalers, health care facilities, and government agencies. In most cases, they are distributed from Company-owned distribution centers or public warehouses. Certain products are co-marketed with other companies. In certain overseas countries, some of these products are marketed and distributed through distributors. Primary marketing efforts for pharmaceutical and nutritional products are directed toward securing the prescription or recommendation of the Company's brand of products by physicians or other health care professionals. Managed care purchasers, for example health maintenance organizations (HMOs) and pharmacy benefit managers, are becoming increasingly important customers. Competition is generally from other broad line and specialized health care manufacturers. A significant aspect of competition is the search for technological innovations. The - ------------------------ * As used throughout the text of this Report, the term \"Company\" refers to Abbott Laboratories, an Illinois corporation, or Abbott Laboratories and its consolidated subsidiaries, as the context requires. introduction of new products by competitors and changes in medical practices and procedures can result in product obsolescence. In addition, the substitution of generic drugs for the brand prescribed has increased competitive pressures on pharmaceutical products. Consumer products are promoted directly to the public by consumer advertising. These products are generally sold directly to retailers and wholesalers. Competitive products are sold by other diversified consumer and health care companies. Competitive factors include consumer advertising, scientific innovation, price, and availability of generic product forms. Agricultural and chemical products are generally sold to agricultural distributors and pharmaceutical companies. Competition is primarily from large chemical and agricultural companies and companies selling specialized agricultural products. Competition is based on numerous factors depending on the market served. Important competitive factors include product performance for specialized industrial and agricultural uses, price, and technological advantages. The Company is the leading worldwide producer of the antibiotic erythromycin. Similac-R- is the leading infant formula product in the United States. Under an agreement between the Company and Takeda Chemical Industries, Ltd. of Japan (Takeda), TAP Pharmaceuticals Inc. (TAP), owned 50 percent by the Company and 50 percent by Takeda, develops and markets in the United States products based on Takeda research. TAP markets Lupron-R-, an LH-RH analog, and Lupron Depot-R-, a sustained release form of Lupron-R-, in the United States. These agents are used for the treatment of advanced prostatic cancer, endometriosis, and central precocious puberty. The Company also has marketing rights to certain Takeda products in select Latin American markets. The Company also markets Lupron-R-, Lupron Depot-R-, and Lupron Depot-Ped-R- in select markets outside the United States. 
HOSPITAL AND LABORATORY PRODUCTS Hospital and laboratory products include diagnostic systems for blood banks, hospitals, commercial laboratories, and alternate-care testing sites; intravenous and irrigation fluids and related administration equipment, including electronic drug delivery systems; drugs and drug delivery systems; anesthetics; critical care products; and other medical specialty products for hospitals and alternate-care sites. The principal products included in this segment are parenteral (intravenous or I.V.) solutions and related administration equipment sold as the LifeCare-R- line of products, LifeShield-R- sets, and Venoset-R- products; irrigating fluids; parenteral nutritionals such as Aminosyn-R- and Liposyn-R-; Plum-R- and Omni-Flow-R- electronic drug delivery systems; Abbott Pain Management Provider-R-; patient-controlled analgesia (PCA) systems; venipuncture products; hospital injectables; premixed I.V. drugs in various containers; ADD-Vantage-R- and Nutrimix-R- drug and nutritional delivery systems; anesthetics, including Pentothal-R-, isoflurane, and enflurane; hemodynamic monitoring equipment; Calcijex-R-, an injectable agent for treatment of bone disease in hemodialysis patients; critical care products including Opticath-R-; screening tests for hepatitis B, HTLV-1, hepatitis B core, and hepatitis C; tests for detection of AIDS antibodies and antigens, and other infectious disease detection systems; tests for determining levels of abused drugs with the ADx-R- instrument; physiological diagnostic tests; cancer monitoring tests including tests for prostate specific antigen; laboratory tests and therapeutic drug monitoring systems such as TDx-R-; clinical chemistry systems such as Abbott Spectrum-R-, Abbott Spectrum-R- EPx-R-, Abbott Spectrum-R- CCx-TM-, and Quantum-TM-; Commander-R- and IMx-R- lines of diagnostic instruments and chemical reagents used with immunoassay diagnostics; Abbott Vision-R-, a desk-top blood analyzer, the Abbott TestPack-R- system for diagnostic testing, and a full line of hematology systems and reagents known as the Cell-Dyn-R- series. The hospital and laboratory products the Company expects to introduce in the United States in 1994 include: AxSym-TM-, a diagnostic system; Abbott Maestro-TM-, a data management system; and EnCounter-R-, a desktop hematology analyzer. The Company markets hospital and laboratory products in the United States and many other countries. These products are generally distributed to wholesalers and directly to hospitals, laboratories, and physicians' offices from distribution centers maintained by the Company. Sales are also made in the home infusion services market directly to patients receiving treatment outside the hospital through marketing arrangements with hospitals and other health care providers. Overseas sales are made either directly to customers or through distributors, depending on the market served. The hospital and laboratory products industry segment is highly competitive, both in the United States and overseas. This segment is subject to competition in technological innovation, price, convenience of use, service, instrument warranty provisions, product performance, long-term supply contracts, and product potential for overall cost effectiveness and productivity gains. Products in this segment can be subject to rapid product obsolescence. 
The Company has benefitted from technological advantages of certain of its current products; however, these advantages may be reduced or eliminated as competitors introduce new products. The Company is one of the leading domestic manufacturers of I.V. and irrigating solutions and related administration equipment, parenteral nutritional products, anesthesia products, and drug delivery systems. It is also the worldwide leader in in vitro diagnostic products, including thyroid tests, therapeutic drug monitoring, cancer monitoring tests, diagnostic tests for the detection of hepatitis and AIDS antibodies, and immunodiagnostic instruments. INFORMATION WITH RESPECT TO THE COMPANY'S BUSINESS IN GENERAL SOURCES AND AVAILABILITY OF RAW MATERIALS The Company purchases, in the ordinary course of business, necessary raw materials and supplies essential to the Company's operations from numerous suppliers in the United States and overseas. There have been no recent availability problems or significant supply shortages. PATENTS, TRADEMARKS, AND LICENSES The Company is aware of the desirability for patent and trademark protection for its products. The Company owns, has applications pending for, and is licensed under a substantial number of patents. Accordingly, where possible, patents and trademarks are sought and obtained for the Company's products in the United States and all countries of major marketing interest to the Company. Principal trademarks and the products they cover are discussed in the Narrative Description of Business on pages 1 and 2. These, and various patents which expire during the period 1994 to 2011, in the aggregate, are believed to be of material importance in the operation of the Company's business. However, the Company believes that no single patent, license, trademark, (or related group of patents, licenses, or trademarks) is material in relation to the Company's business as a whole. SEASONAL ASPECTS, CUSTOMERS, BACKLOG, AND RENEGOTIATION There are no significant seasonal aspects to the Company's business. The incidence of certain infectious diseases which occur at various times in different areas of the world does, however, affect the demand for the Company's anti-infective products. Orders for the Company's products are generally filled on a current basis, and order backlog is not material to the Company's business. No single customer accounted for sales equaling 10 percent or more of the Company's consolidated net sales. No material portion of the Company's business is subject to renegotiation of profits or termination of contracts at the election of the government. RESEARCH AND DEVELOPMENT The Company spent $880,974,000 in 1993, $772,407,000 in 1992, and $666,336,000 in 1991 on research to discover and develop new products and processes and to improve existing products and processes. The Company continues to concentrate research expenditures in pharmaceutical and diagnostic products. ENVIRONMENTAL MATTERS The Company believes that its operations comply in all material respects with applicable laws and regulations concerning environmental protection. Regulations under federal and state environmental laws impose stringent limitations on emissions and discharges to the environment from various manufacturing operations. The Company's capital and operating expenditures for pollution control in 1993 were approximately $32 million and $31 million, respectively. 
Capital and operating expenditures for pollution control are estimated to approximate $39 million and $36 million, respectively, in 1994. The Company is participating as one of many potentially responsible parties in investigation and/ or remediation at eight locations in the United States and Puerto Rico under the Comprehensive Environmental Response, Compensation, and Liability Act, commonly known as Superfund. The aggregate costs of remediation at these sites by all identified parties are uncertain but have been subject to widely ranging estimates totaling as much as several hundred million dollars. In many cases, the Company believes that the actual costs will be lower than these estimates, and the fraction for which the Company may be responsible is anticipated to be considerably less and will be paid out over a number of years. The Company expects to participate in the investigation or cleanup at these sites. The Company is also voluntarily investigating potential contamination at five Company-owned sites, and has initiated voluntary remediation at four Company-owned sites, in cooperation with the Environmental Protection Agency (EPA) or similar state agencies. While it is not feasible to predict with certainty the costs related to the previously described investigation and cleanup activities, the Company believes that such costs, together with other expenditures to maintain compliance with applicable laws and regulations concerning environmental protection, should not have a material adverse effect on the Company's earnings or competitive position. EMPLOYEES The Company employed 49,659 persons as of December 31, 1993. REGULATION The development, manufacture, sale, and distribution of the Company's products are subject to comprehensive government regulation, and the general trend is toward more stringent regulation. Government regulation by various federal, state, and local agencies, which includes detailed inspection of and controls over research and laboratory procedures, clinical investigations, and manufacturing, marketing, sampling, distribution, recordkeeping, storage and disposal practices, substantially increases the time, difficulty, and costs incurred in obtaining and maintaining the approval to market newly developed and existing products. Government regulatory actions can result in the seizure or recall of products, suspension or revocation of the authority necessary for their production and sale, and other civil or criminal sanctions. Continuing studies of the utilization, safety, and efficacy of health care products and their components are being conducted by industry, government agencies, and others. Such studies, which employ increasingly sophisticated methods and techniques, can call into question the utilization, safety, and efficacy of previously marketed products and in some cases have resulted, and may in the future result, in the discontinuance of marketing of such products and give rise to claims for damages from persons who believe they have been injured as a result of their use. The cost of human health care products continues to be a subject of investigation and action by governmental agencies, legislative bodies, and private organizations in the United States and other countries. In the United States, most states have enacted generic substitution legislation requiring or permitting a dispensing pharmacist to substitute a different manufacturer's version of a pharmaceutical product for the one prescribed. 
Federal and state governments continue to press efforts to reduce costs of Medicare and Medicaid programs, including restrictions on amounts agencies will reimburse for the use of products. Manufacturers must pay certain statutorily-prescribed rebates on Medicaid purchases for reimbursement on prescription drugs under state Medicaid plans. In addition, the Federal government follows a diagnosis-related group (DRG) payment system for certain institutional services provided under Medicare or Medicaid. The DRG system entitles a health care facility to a fixed reimbursement based on discharge diagnoses rather than actual costs incurred in patient treatment, thereby increasing the incentive for the facility to limit or control expenditures for many health care products. The Veterans Health Care Act of 1992 requires manufacturers to extend additional discounts on pharmaceutical products to various federal agencies, including the Department of Veterans Affairs, Department of Defense, and Public Health Service entities and institutions. In the United States, governmental cost-containment efforts have extended to the federally subsidized Special Supplemental Food Program for Women, Infants, and Children (WIC). All states participate in WIC and have sought and obtained rebates from manufacturers of infant formula whose products are used in the program. All of the states have also conducted competitive bidding for infant formula contracts which require the use of specific infant formula products for the state WIC program. The Child Nutrition and WIC Reauthorization Act of 1989 requires all states participating in WIC to engage in competitive bidding upon the expiration of their existing infant formula contracts. Governmental regulatory agencies now require manufacturers to pay additional fees. Under the Prescription Drug User Fee Act of 1992, the Federal Food and Drug Administration imposes substantial fees on various aspects of the approval, manufacture and sale of prescription drugs. Congress is now considering expanding user fees to medical devices. The Company believes that such legislation, if enacted, will add considerable expense for the Company. In the United States comprehensive legislation has been proposed that would make significant changes to the availability, delivery and payment for healthcare products and services. It is the intent of such proposed legislation to provide health and medical insurance for all United States citizens and to reduce the rate of increases in United States healthcare expenditures. If such legislation is enacted, the Company believes it could have the effect of reducing prices for, or reducing the rate of price increases for health and medical insurance and medical products and services. International operations are also subject to a significant degree of government regulation. Many countries, directly or indirectly through reimbursement limitations, control the selling price of most health care products. Furthermore, many developing countries limit the importation of raw materials and finished products. International regulations are having an impact on United States regulations, as well. The International Organization for Standardization (\"ISO\") provides the voluntary criteria for regulating medical devices within the European Economic Community. The Food and Drug Administration (\"FDA\") has announced that it will attempt to harmonize its regulation of medical devices with that of the ISO. 
Recently published changes to the FDA's regulations governing the manufacture of medical devices appear to encompass and exceed the ISO's approach to regulating medical devices. The FDA's adoption of the ISO's approach to regulation and other changes to the manner in which the FDA regulates medical devices will increase the cost of compliance with those regulations. Efforts to reduce health care costs are also being made in the private sector. Health care providers have responded by instituting various cost reduction and containment measures. It is not possible to predict the extent to which the Company or the health care industry in general might be affected by the matters discussed above. INTERNATIONAL OPERATIONS The Company markets products in approximately 130 countries through affiliates and distributors. Most of the products discussed in the preceding sections of this report are sold outside the United States. In addition, certain products of a local nature and variations of product lines to meet local regulatory requirements and marketing preferences are manufactured and marketed to customers outside the United States. International operations are subject to certain additional risks inherent in conducting business outside the United States, including price and currency exchange controls, changes in currency exchange rates, limitations on foreign participation in local enterprises, expropriation, nationalization, and other governmental action. ITEM 2.\n\n\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "/local_disk0/.ephemeral_nfs/envs/pythonEnv-5d6eadb9-688e-4900-84da-417027122f1f/lib/python3.10/site-packages/streaming/base/dataset.py:397: UserWarning: Because `predownload` was not specified, it will default to 8*batch_size if batch_size is not None, otherwise 64. Prior to Streaming v0.7.0, `predownload` defaulted to max(batch_size, 256 * batch_size // num_canonical_nodes).\n warnings.warn(f'Because `predownload` was not specified, it will default to ' +\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "ITEM 1. BUSINESS GENERAL DEVELOPMENT OF BUSINESS Abbott Laboratories is an Illinois corporation, incorporated in 1900. Abbott's* principal business is the discovery, development, manufacture, and sale of a broad and diversified line of health care products. FINANCIAL INFORMATION RELATING TO INDUSTRY SEGMENTS, GEOGRAPHIC AREAS, AND CLASSES OF SIMILAR PRODUCTS Incorporated herein by reference is Note 6 entitled \"Segment and Geographic Area Information\" of the Notes to Consolidated Financial Statements included under Item 8, \"Financial Statements and Supplementary Data\" and the sales information related to HUMIRA® included in \"Financial Review.\" NARRATIVE DESCRIPTION OF BUSINESS Through December 31, 2012, Abbott had five reportable revenue segments: Proprietary Pharmaceutical Products, Established Pharmaceutical Products, Diagnostic Products, Nutritional Products, and Vascular Products. On January 1, 2013, Abbott completed the separation of its research-based pharmaceuticals business through the distribution of the issued and outstanding common stock of AbbVie Inc. (AbbVie) to Abbott's shareholders. AbbVie was formed to hold Abbott's research-based pharmaceuticals business and, as a result of the distribution, is now an independent public company trading under the symbol \"ABBV\" on the New York Stock Exchange. 
*As used throughout the text of this report on Form 10-K, the term \"Abbott\" refers to Abbott Laboratories, an Illinois corporation, or Abbott Laboratories and its consolidated subsidiaries, as the context requires. Proprietary Pharmaceutical Products These products include a broad line of adult and pediatric pharmaceuticals manufactured, marketed, and sold worldwide (except as noted) and are generally sold directly to wholesalers, distributors, government agencies, health care facilities, specialty pharmacies, and independent retailers from distribution centers and public warehouses. Outside the United States, sales are made either directly to customers or through distributors, depending on the market served. Certain products are co-marketed or co-promoted with other companies. As a result of the separation of Abbott's research-based pharmaceuticals business, beginning in 2013, Abbott will no longer have a Proprietary Pharmaceutical Products segment. The principal products included in the Proprietary Pharmaceutical Products segment are: •HUMIRA®, for the treatment of rheumatoid arthritis, psoriatic arthritis, ankylosing spondylitis, psoriasis, juvenile idiopathic arthritis, and Crohn's disease as well as ulcerative colitis in the United States and European Union and axial spondyloarthritis and pediatric Crohn's disease in the European Union; •Kaletra®, also marketed as Aluvia®, and Norvir® for the treatment of HIV infection; •Lupron®, also marketed as Lucrin®, used for the palliative treatment of advanced prostate cancer, treatment of endometriosis and central precocious puberty, and for the preoperative treatment of patients with anemia caused by uterine fibroids; •Synagis®, for the prevention of respiratory syncytial virus (RSV); •AndroGel®, for the treatment of adult males who have low testosterone (marketed and sold in the United States); •the anesthesia product sevoflurane (sold under the trademarks Ultane® and Sevorane®); •Zemplar®, for the prevention and treatment of secondary hyperparathyroidism associated with Stage 3, 4, or 5 chronic kidney disease; •Synthroid®, for the treatment of hypothyroidism (marketed and sold in the United States); •Creon®, for the treatment of pancreatic exocrine insufficiency associated with several underlying conditions, including cystic fibrosis and chronic pancreatitis (marketed and sold in the United States); and •TriCor®, Trilipix®, Simcor®, and Niaspan®, for the treatment of dyslipidemia (marketed and sold in the United States). The Proprietary Pharmaceutical Products segment directs its primary marketing efforts toward securing the prescription, or recommendation, of its pharmaceutical products by physicians. Managed care providers, market access organizations (for example, health maintenance organizations and pharmacy benefit managers) and national and regional governments and agencies (for example, the United States Department of Veterans Affairs and the United States Department of Defense) are also important customers. Competition in the Proprietary Pharmaceutical Products segment is generally from other health care and pharmaceutical companies. The search for technological innovations in pharmaceutical products is a significant aspect of competition in this segment. The introduction of new products by competitors and changes in medical practices and procedures can result in product obsolescence in the Proprietary Pharmaceutical Products segment. Price can also be a factor. 
In addition, the substitution of generic drugs for the brand prescribed has increased competitive pressures on pharmaceutical products that do not have patent protection. Established Pharmaceutical Products These products include a broad line of branded generic pharmaceuticals manufactured worldwide and marketed and sold outside the United States, and are generally sold directly to wholesalers, distributors, government agencies, health care facilities, specialty pharmacies, and independent retailers from Abbott-owned distribution centers and public warehouses, depending on the market served. Certain products are co-marketed or co-promoted with other companies. The principal products included in the Established Pharmaceutical Products segment are: •Creon®, for the treatment of pancreatic exocrine insufficiency associated with several underlying conditions, including cystic fibrosis and chronic pancreatitis (marketed and sold outside the United States); •the anti-infective clarithromycin (sold under the trademarks Biaxin®, Klacid®, and Klaricid®); •Influvac®, an influenza vaccine available during flu season; •Serc®, for the treatment of Ménière's disease and vestibular vertigo; •Brufen®, for the treatment of pain, fever and inflammation; •Synthroid®, for the treatment of hypothyroidism (marketed and sold outside the United States); •Duspatal® and Dicetel®, for the treatment of irritable bowel syndrome or biliary spasm; •Duphaston®, for the treatment of many different gynecological disorders; •Adomet®, Heptral®, Transmetil®, Samyr®, and Donamet®, for the treatment of intrahepatic cholestasis (associated with liver disease) or depressive symptoms; •Duphalac®, for regulation of the physiological rhythm of the colon; •Lipanthyl® and TriCor®, for the treatment of dyslipidemia (marketed and sold outside the United States); and •Teveten® and Teveten® Plus, for the treatment of essential hypertension, and Physiotens®, for the treatment of hypertension. The Established Pharmaceutical Products segment directs its primary marketing efforts toward securing the prescription, or recommendation, of Abbott's brand of products by physicians both in the primary care and secondary (hospital) care environment. Government agencies are also important customers. Competition in the Established Pharmaceutical Products segment is generally from other health care and pharmaceutical companies. Changes to government tenders and reimbursement schemes are significant factors with respect to pricing. In addition, the substitution of generic drugs for the brand prescribed and introduction of additional forms of already marketed established products by generic or branded competitors have increased competitive pressures. Diagnostic Products These products include a broad line of diagnostic systems and tests manufactured, marketed, and sold worldwide to blood banks, hospitals, commercial laboratories, clinics, physicians' offices, government agencies, alternate-care testing sites, and plasma protein therapeutic companies. The segment's products are generally marketed and sold directly from Abbott-owned distribution centers, public warehouses and third-party distributors. Outside the United States, sales are made either directly to customers or through distributors, depending on the market served. 
The principal products included in the Diagnostic Products segment are: •immunoassay and clinical chemistry systems, including ARCHITECT® and ABBOTT PRISM®; •assays used for screening and/or diagnosis for drugs of abuse, cancer, therapeutic drug monitoring, fertility, physiological diseases, and infectious diseases such as hepatitis and HIV; •the m2000™, an instrument that automates the extraction, purification, and preparation of DNA and RNA from patient samples, and detects and measures infectious agents including HIV, HBV, HCV, HPV, and CT/NG; •the Vysis® product line of genomic-based tests, including the PathVysion® HER-2 DNA probe kit; the UroVysion® bladder cancer recurrence kit; and the Vysis ALK Break Apart FISH Probe Kit, the only FDA-approved companion diagnostic to Pfizer's approved non-small-cell lung cancer therapy XALKORI®; •informatics and automation solutions for use in the laboratory; •a full line of hematology systems and reagents known as the Cell-Dyn® series; and •the i-STAT® point-of-care diagnostic systems and tests for blood analysis. In addition, under a distribution agreement with Celera Group, the Diagnostic Products segment exclusively distributes certain Celera molecular diagnostic products, including the ViroSeq® HIV genotyping system and products used for the detection of mutations in the CFTR gene, which causes cystic fibrosis. The Diagnostic Products segment's products are subject to competition in technological innovation, price, convenience of use, service, instrument warranty provisions, product performance, long-term supply contracts, and product potential for overall cost-effectiveness and productivity gains. Some products in this segment can be subject to rapid product obsolescence or regulatory changes. Although Abbott has benefited from technological advantages of certain of its current products, these advantages may be reduced or eliminated as competitors introduce new products. Nutritional Products These products include a broad line of pediatric and adult nutritional products manufactured, marketed, and sold worldwide. These products are generally marketed and sold directly to customers and to institutions, wholesalers, retailers, health care facilities, government agencies, and third\n\n-party distributors from Abbott-owned distribution centers or third-party distributors. The principal products included in the Nutritional Products segment are: •various forms of prepared infant formula and follow-on formula, including Similac®Advance®, Similac® Advance® with EarlyShield®, Similac®, Similac® with Iron, Similac Sensitive®, Similac Sensitive® RS, Similac Go&Grow®, Similac® NeoSure®, Similac® Organic, Similac Special Care®, Similac® Total Comfort®, Isomil® Advance®, Isomil®, Alimentum®, Gain®, and Grow®; •adult and other pediatric nutritional products, including Ensure®, Ensure Plus®, Ensure® Muscle Health, Ensure® (with Nutrivigor®), Glucerna®, Glucerna® Hunger Smart®, ProSure®, PediaSure®, PediaSure Sidekicks®, EleCare®, Juven®, Abound®, and Pedialyte®; •nutritional products used in enteral feeding in health care institutions, including Jevity®, Glucerna® 1.2 Cal, Glucerna® 1.5 Cal, Osmolite®, Oxepa®, Freego (Enteral Pump) and Freego® sets, and Nepro®; and •Zone Perfect® bars and the EAS® family of nutritional brands, including Myoplex® and AdvantEdge®. Primary marketing efforts for nutritional products are directed toward securing the recommendation of Abbott's brand of products by physicians or other health care professionals. 
In addition, certain nutritional products sold as Gain™, Grow™, PediaSure®, PediaSure Sidekicks®, Pedialyte®, Ensure®, Zone Perfect®, EAS®/Myoplex®, and Glucerna® are also promoted directly to the public by consumer marketing efforts in select markets. Competition for nutritional products in the segment is generally from other diversified consumer and health care manufacturers. Competitive factors include consumer advertising, formulation, packaging, scientific innovation, intellectual property, price, and availability of product forms. A significant aspect of competition is the search for ingredient innovations. The introduction of new products by competitors, changes in medical practices and procedures, and regulatory changes can result in product obsolescence. In addition, private label and local manufacturers' products may increase competitive pressure. Vascular Products These products include a broad line of coronary, endovascular, vessel closure, and structural heart devices for the treatment of vascular disease manufactured, marketed and sold worldwide. The segment's products are generally marketed and sold directly to hospitals from Abbott-owned distribution centers and public warehouses. Outside the United States, sales are made either directly to customers or through distributors, depending on the market served. The principal products included in the Vascular Products segment are: •Xience Xpedition®, Xience Prime®, Xience nano™, and Xience V®, drug-eluting coronary stent systems developed on the Multi-Link Vision® platform; •Absorb®, a drug-eluting coronary bioresorbable vascular scaffold; •Multi-Link 8®, Multi-Link Vision® and Multi-Link Mini Vision®, coronary metallic stents; •TREK® and Voyager®, coronary balloon dilatation products; •Hi-Torque Balance Middleweight Elite® and ASAHI® coronary guidewires (licensed from Asahi Intecc Co., Ltd.); •StarClose® and Perclose® vessel closure devices; •Acculink®/Accunet® and Xact®/Emboshield NAV6®, carotid stent systems; •Armada® and Absolute Pro Peripheral® balloon dilatation products; •Herculink Elite Renal® and Omnilink Elite Iliac® stent systems; and •MitraClip®, a percutaneous valve repair system. The Vascular Products segment's products are subject to competition in technological innovation, price, convenience of use, service, product performance, long-term supply contracts, and product potential for overall cost-effectiveness and productivity gains. Some products in this segment can be subject to rapid product obsolescence or regulatory changes. Although Abbott has benefited from technological advantages of certain of its current products, these advantages may be reduced or eliminated as competitors introduce new products. Other Products The principal products in Abbott's other businesses include blood glucose monitoring meters, test strips, data management software and accessories for people with diabetes, including the FreeStyle® product line, and medical devices for the eye, including cataract surgery, LASIK surgery, contact lens care products, and dry eye products. These products are mostly marketed worldwide and generally sold directly to wholesalers, government agencies, health care facilities, mail order pharmacies, and independent retailers from Abbott-owned distribution centers and public warehouses. Some of these products are marketed and distributed through distributors. Blood glucose monitoring meters, contact lens care products, and dry eye products are also marketed and sold over-the-counter to consumers. 
These products are subject to competition in technological innovation, price, convenience of use, service, and product performance. Medical devices for the eye also can be subject to rapid product obsolescence or regulatory changes. INFORMATION WITH RESPECT TO ABBOTT'S BUSINESS IN GENERAL Sources and Availability of Raw Materials Abbott purchases, in the ordinary course of business, raw materials and supplies essential to Abbott's operations from numerous suppliers in the United States and abroad. There have been no recent significant availability problems or supply shortages. Patents, Trademarks, and Licenses Abbott is aware of the desirability for patent and trademark protection for its products. Accordingly, where possible, patents and trademarks are sought and obtained for Abbott's products in the United States and all countries of major marketing interest to Abbott. Abbott owns and is licensed under a substantial number of patents and patent applications. Principal trademarks and the products they cover are discussed in the Narrative Description of Business on pages 1 through 5. These, and various patents which expire during the period 2013 to 2032, in the aggregate, are believed to be of material importance in the operation of Abbott's business. Abbott believes that, after the separation of AbbVie, no single patent, license, or trademark is material in relation to Abbott's business as a whole. In connection with the separation and distribution of AbbVie, Abbott contributed certain pharmaceutical related patents, licenses, and trademarks to AbbVie. Patent-related litigation is discussed in Legal Proceedings on pages 18 through 20. Seasonal Aspects, Customers, Backlog, and Renegotiation There are no significant seasonal aspects to Abbott's business. Abbott has no single customer that, if the customer were lost, would have a material adverse effect on Abbott. Orders for Abbott's products are generally filled on a current basis, and order backlog is not material to Abbott's business. No material portion of Abbott's business is subject to renegotiation of profits or termination of contracts at the election of the government. Research and Development Abbott spent approximately $4.3 billion in 2012, $4.1 billion in 2011, and $3.7 billion in 2010, on research to discover and develop new products and processes and to improve existing products and processes. The majority of research and development expenditures was concentrated on proprietary pharmaceutical products. Environmental Matters Abbott believes that its operations comply in all material respects with applicable laws and regulations concerning environmental protection. Regulations under federal and state environmental laws impose stringent limitations on emissions and discharges to the environment from various manufacturing operations. Abbott's capital and operating expenditures for pollution control in 2012 were approximately $12 million and $63 million, respectively. After the separation of AbbVie, capital and operating expenditures for pollution control in 2013 are estimated to be $10 million and $53 million, respectively. Abbott has been identified as one of many potentially responsible parties in investigations and/or remediations at several locations in the United States, including Puerto Rico, under the Comprehensive Environmental Response, Compensation, and Liability Act, commonly known as Superfund. 
Abbott is also engaged in remediation at several other sites, some of which are owned by Abbott, in cooperation with the Environmental Protection Agency (EPA) or similar agencies. While it is not feasible to predict with certainty the final costs related to those investigations and remediation activities, Abbott believes that such costs, together with other expenditures to maintain compliance with applicable laws and regulations concerning environmental protection, should not have a material adverse effect on Abbott's financial position, cash flows, or results of operations. Employees Abbott employed approximately 91,000 persons as of December 31, 2012. Approximately 21,000 persons were transferred to AbbVie in connection with the separation. Regulation The development, manufacture, marketing, sale, promotion, and distribution of Abbott's products are subject to comprehensive government regulation by the U.S. Food and Drug Administration and similar international regulatory agencies. Government regulation by various international, supranational, federal and state agencies, both domestic and international, addresses (among other matters) the development and approval to market Abbott's products, as well as the inspection of, and controls over, research and laboratory procedures, clinical investigations, product approvals and manufacturing, labeling, packaging, supply chains, marketing and promotion, pricing and reimbursement, sampling, distribution, quality control, post-market surveillance, record keeping, storage, and disposal practices. Abbott's international operations are also affected by trade regulations in many countries that limit the import of raw materials and finished products and by local and international laws and regulations that seek to prevent corruption and bribery in the marketplace (including the United States Foreign Corrupt Practices Act and the United Kingdom Bribery Act which provide among other things, guidance on corporate interactions with government officials). In addition, Abbott is subject to laws and regulations pertaining to health care fraud and abuse, including state and federal\n\n anti-kickback and false claims laws in the United States. Prescription drug, nutrition, and medical device manufacturers such as Abbott are also subject to taxes, as well as application, product, user, establishment, and other fees. Governmental agencies can also invalidate intellectual property rights and control the entrance of multi-source drugs for small molecule and generic biologic medicines. Compliance with these laws and regulations is costly and materially affects Abbott's business. Among other effects, health care regulations substantially increase the time, difficulty, and costs incurred in obtaining and maintaining approval to market newly developed and existing products. Abbott expects this regulatory environment will continue to require significant technical expertise and capital investment to ensure compliance. Failure to comply can delay the release of a new product or result in regulatory and enforcement actions, the seizure or recall of a product, the suspension or revocation of the authority necessary for a product's production and sale, and other civil or criminal sanctions, including fines and penalties. 
In addition to regulatory initiatives, Abbott's business can be affected by ongoing studies of the utilization, safety, efficacy, and outcomes of health care products and their components that are regularly conducted by industry participants, government agencies, and others. These studies can call into question the utilization, safety, and efficacy of previously marketed products. In some cases, these studies have resulted, and may in the future result, in the discontinuance of marketing of such products domestically or globally, and may give rise to claims for damages from persons who believe they have been injured as a result of their use. Access to human health care products continues to be a subject of investigation and action by governmental agencies, legislative bodies, and private organizations in the United States and other countries. A major focus is cost containment. Efforts to reduce health care costs are also being made in the private sector, notably by health care payors and providers, which have instituted various cost reduction and containment measures. Abbott expects insurers and providers to continue attempts to reduce the cost of health care products. Many countries control the price of health care products directly or indirectly, through reimbursement, payment, pricing, coverage limitations, or compulsory licensing, and are adopting laws and rules to govern the introduction of biosimilar products. Domestic and foreign budgetary pressures may also heighten the scope and severity of pricing pressures on Abbott's products for the foreseeable future. Specifically, U.S. federal laws requiring pharmaceutical manufacturers to pay certain statutorily-prescribed rebates to state Medicaid programs on prescription drugs reimbursed under state Medicaid plans, and the efforts by states to seek additional rebates, affect Abbott's proprietary pharmaceutical business. Similarly, the Veterans Health Care Act of 1992 requires manufacturers to extend additional discounts on pharmaceutical products to various federal agencies, including the Department of Veterans Affairs, Department of Defense, Public Health Service entities and institutions, as well as certain other covered entities. The Veterans Health Care Act also established the 340B drug discount program, which requires pharmaceutical manufacturers to provide products at reduced prices to designated health care facilities. In the United States, most states also have generic substitution legislation requiring or permitting a dispensing pharmacist to substitute a different manufacturer's version of a pharmaceutical product for the one prescribed. In addition, the federal government follows a diagnosis-related group (DRG) payment system for certain institutional services provided under Medicare or Medicaid and has implemented a prospective payment system (PPS) for services delivered in hospital outpatient, nursing home, and home health settings. DRG and PPS entitle a health care facility to a fixed reimbursement based on the diagnosis and/or procedure rather than actu\n\n*** WARNING: max output size exceeded, skipping output. ***\n\nall available on Abbott's investor relations website (www.abbottinvestor.com). ITEM 1A.\nITEM 1A. RISK FACTORS In addition to the other information in this report, the following risk factors should be considered before deciding to invest in any of Abbott's securities. 
Additional risks and uncertainties not presently known to Abbott, or risks Abbott currently considers immaterial, could also affect Abbott's actual results. Abbott's business, financial condition, results of operations, or prospects could be materially adversely affected by any of these risks. Abbott may acquire other businesses, license rights to technologies or products, form alliances, or dispose of or spin-off businesses, which could cause it to incur significant expenses and could negatively affect profitability. Abbott may pursue acquisitions, technology licensing arrangements, and strategic alliances, or dispose of or spin-off some of its businesses, as part of its business strategy. Abbott may not complete these transactions in a timely manner, on a cost-effective basis, or at all, and may not realize the expected benefits. If Abbott is successful in making an acquisition, the products and technologies that are acquired may not be successful or may require significantly greater resources and investments than originally anticipated. Abbott may not be able to integrate acquisitions successfully into its existing business and could incur or assume significant debt and unknown or contingent liabilities. Abbott could also experience negative effects on its reported results of operations from acquisition or disposition-related charges, amortization of expenses related to intangibles and charges for impairment of long-term assets. These effects could cause a deterioration of Abbott's credit rating and result in increased borrowing costs and interest expense. The expiration or loss of patent protection and\n\n licenses may affect Abbott's future revenues and operating income. Many of Abbott's businesses rely on patent and trademark and other intellectual property protection. Although most of the challenges to Abbott's intellectual property have come from other businesses, governments may also challenge intellectual property protections. To the extent Abbott's intellectual property is successfully challenged, invalidated, or circumvented or to the extent it does not allow Abbott to compete effectively, Abbott's business will suffer. To the extent that countries do not enforce Abbott's intellectual property rights or to the extent that countries require compulsory licensing of its intellectual property, Abbott's future revenues and operating income will be reduced. Abbott's patents and trademarks are described in greater detail in the section captioned \"Patents, Trademarks, and Licenses,\" and litigation regarding these patents is described in the section captioned \"Legal Proceedings.\" Competitors' intellectual property may prevent Abbott from selling its products or have a material adverse effect on Abbott's future profitability and financial condition. Competitors may claim that an Abbott product infringes upon their intellectual property. Resolving an intellectual property infringement claim can be costly and time consuming and may require Abbott to enter into license agreements. Abbott cannot guarantee that it would be able to obtain license agreements on commercially reasonable terms. A successful claim of patent or other intellectual property infringement could subject Abbott to significant damages or an injunction preventing the manufacture, sale or use of affected Abbott products. Any of these events could have a material adverse effect on Abbott's profitability and financial condition. 
Abbott is subject to cost containment efforts that could cause a reduction in future revenues and operating income. In the United States and other countries, Abbott's businesses have experienced downward pressure on product pricing. Cost containment efforts by governments and private organizations are described in greater detail in the section captioned \"Regulation.\" To the extent these cost containment efforts are not offset by greater patient access to health care or other factors, Abbott's future revenues and operating income will be reduced. Abbott is subject to numerous governmental regulations and it can be costly to comply with these regulations and to develop compliant products and processes. Abbott's products are subject to rigorous regulation by the U.S. Food and Drug Administration, and numerous international, supranational, federal, and state authorities. The process of obtaining regulatory approvals to market a drug or medical device can be costly and time-consuming, and approvals might not be granted for future products, or additional indications or uses of existing products, on a timely basis, if at all. Delays in the receipt of, or failure to obtain approvals for, future products, or new indications and uses, could result in delayed realization of product revenues, reduction in revenues, and in substantial additional costs. In addition, no assurance can be given that Abbott will remain in compliance with applicable FDA and other regulatory requirements once clearance or approval has been obtained for a product. These requirements include, among other things, regulations regarding manufacturing practices, product labeling, and advertising and postmarketing reporting, including adverse event reports and field alerts due to manufacturing quality concerns. Many of Abbott's facilities and procedures and those of Abbott's suppliers are subject to ongoing regulation, including periodic inspection by the FDA and other regulatory authorities. Abbott must incur expense and spend time and effort to ensure compliance with these complex regulations. Possible regulatory actions for non-compliance could include warning letters, fines, damages, injunctions, civil penalties, recalls, seizures of Abbott's products, and criminal prosecution. These actions could result in, among other things, substantial modifications to Abbott's business practices and operations; refunds, recalls, or seizures of Abbott's products; a total or partial shutdown of production in one or more of Abbott's facilities while Abbott or Abbott's suppliers remedy the alleged violation; the inability to obtain future pre-market clearances or approvals; and withdrawals or suspensions of current products from the market. Any of these events could disrupt Abbott's business and have a material adverse effect on Abbott's revenues, profitability and financial condition. Laws and regulations affecting government benefit programs could impose new obligations on Abbott, require Abbott to change its business practices, and restrict its operations in the future. Abbott's industry is also subject to various federal, state, and international laws and regulations pertaining to government benefit program reimbursement, price reporting and regulation, and health care fraud and abuse, including anti-kickback and false claims laws, and international and individual state laws relating to pricing and sales and marketing practices. 
Violations of these laws may be punishable by criminal and/or civil sanctions, including, in some instances, substantial fines, imprisonment, and exclusion from participation in federal and state health care programs, including Medicare, Medicaid, and Veterans Administration health programs. These laws and regulations are broad in scope and they are subject to evolving interpretations, which could require Abbott to incur substantial costs associated with compliance or to alter one or more of its sales or marketing practices. In addition, violations of these laws, or allegations of such violations, could disrupt Abbott's business and result in a material adverse effect on Abbott's revenues, profitability, and financial condition. Changes in the health care regulatory environment may adversely affect Abbott's business. A number of the provisions of the Patient Protection and Affordable Care Act and the Health Care and Education Reconciliation Act of 2010 require further rulemaking action by governmental agencies to implement. The laws change access to health care products and services and create new fees for the pharmaceutical and medical device industries. Future rulemaking could increase rebates, reduce prices or the rate of price increases for health care products and services, or require additional reporting and disclosure. Abbott cannot predict the timing or impact of any future rulemaking. Abbott's research and development efforts may not succeed in developing commercially successful products and technologies, which may cause Abbott's revenue and profitability to decline. To remain competitive, Abbott must continue to launch new products and technologies. To accomplish this, Abbott commits substantial efforts, funds, and other resources to research and development. A high rate of failure is inherent in the research and development of new products and technologies. Abbott must make ongoing substantial expenditures without any assurance that its efforts will be commercially successful. Failure can occur at any point in the process, including after significant funds have been invested. Promising new product candidates may fail to reach the market or may only have limited commercial success because of efficacy or safety concerns, failure to achieve positive clinical outcomes, inability to obtain necessary regulatory approvals, limited scope of approved uses, excessive costs to manufacture, the failure to establish or maintain intellectual property rights, or infringement of the intellectual property rights of others. Even if Abbott successfully develops new products or enhancements or new generations of Abbott's existing products, they may be quickly rendered obsolete by changing customer preferences, changing industry standards, or competitors' innovations. Innovations may not be accepted quickly in the marketplace because of, among other things, entrenched patterns of clinical practice or uncertainty over third-party reimbursement. Abbott cannot state with certainty when or whether any of its products under development will be launched, whether it will be able to develop, license, or otherwise acquire compounds or products, or whether any products will be commercially successful. Failure to launch successful new products or new indications for existing products may cause Abbott's products to become obsolete, causing Abbott's revenues and operating results to suffer. New products and technological advances by Abbott's competitors may negatively affect Abbott's results of operations. 
Abbott's products face intense competition from its competitors' products. Competitors' products may be safer, more effective, more effectively marketed or sold, or have lower prices or superior performance features than Abbott's products. Abbott cannot predict with certainty the timing or impact of the introduction of competitors' products. The manufacture of many of Abbott's products is a highly exacting and complex process, and if Abbott or one of its suppliers encounters problems manufacturing products, Abbott's business could suffer. The manufacture of many of Abbott's products is a highly exacting and complex process, due in part to strict regulatory requirements. Problems may arise during manufacturing for a variety of reasons, including equipment malfunction, failure to follow specific protocols and procedures, problems with raw materials, natural disasters, and environmental factors. In addition, single suppliers are currently used for certain products and materials. If problems arise during the production of a batch of product, that batch of product may have to be discarded. This could, among other things, lead to increased costs, lost revenue, damage to customer relations, time and expense spent investigating the cause and, depending on the cause, similar losses with respect to other batches or products. If problems are not discovered before the product is released to the market, recall and product liability costs may also be incurred. To the extent Abbott or one of its suppliers experiences significant manufacturing problems, this could have a material adverse effect on Abbott's revenues and profitability. Significant safety issues could arise for Abbott's products, which could have a material adverse effect on Abbott's revenues and financial condition. Health care products typically receive regulatory approval based on data obtained in controlled clinical trials of limited duration. Following regulatory approval, these products will be used over longer periods of time in many patients. Investigators may also conduct additional, and perhaps more extensive, studies. If new safety issues are reported, Abbott may be required to amend the conditions of use for a product. For example, Abbott may be required to provide additional warnings on a product's label or narrow its approved intended use, either of which could reduce the product's market acceptance. If serious safety issues arise with an Abbott product, sales of the product could be halted by Abbott or by regulatory authorities. Safety issues affecting suppliers' or competitors' products also may reduce the market acceptance of Abbott's products. In addition, in the ordinary course of business, Abbott is the subject of product liability claims and lawsuits alleging that its products or the products of other companies that Abbott promotes have resulted or could result in an unsafe condition for or injury to patients. Product liability claims and lawsuits and safety alerts or product recalls, regardless of their validity or ultimate outcome, may have a material adverse effect on Abbott's business and reputation and on Abbott's ability to attract and retain customers. Consequences\n\n may also include additional costs, a decrease in market share for the products, lower income or exposure to other claims. Product liability losses are self-insured. Product liability claims could have a material adverse effect on Abbott's profitability and financial condition. 
Further deterioration in the economic position and credit quality of certain European countries may negatively affect Abbott's results of operations. If economic conditions in certain European countries, including Greece, Portugal, Italy, and Spain, continue to worsen, the time it takes to collect outstanding trade receivables may increase. Financial instability and fiscal deficits in these countries may result in additional austerity measures to reduce costs, including health care. At the same time, ongoing sovereign debt issues, including the impact of credit downgrades, could increase Abbott's collection risk given that a significant amount of Abbott's receivables in these countries are with governmental health care systems. Abbott depends on sophisticated information technology systems to operate its business and a cyber attack or other breach of these systems could have a material adverse effect on Abbott's results of operations. Similar to other large multi-national companies, the size and complexity of Abbott's information technology systems makes them vulnerable to a cyber attack, malicious intrusion, breakdown, destruction, loss of data privacy, or other significant disruption. Abbott's systems have been and are expected to continue to be the target of malware and other cyber attacks. Abbott has invested in its systems and the protection of its data to reduce the risk of an invasion or interruption and monitors its systems on an ongoing basis for any current or potential threats. There can be no assurance that these measures and efforts will prevent future interruptions or breakdowns that could have a significant effect on Abbott's business. Abbott may incur operational difficulties or be exposed to claims and liabilities as a result of the separation. AbbVie and Abbott entered into a separation and distribution agreement and various other agreements to govern the separation of AbbVie from Abbott and the relationship between the two companies going forward. Certain of these agreements provide for the performance of services by each company for the benefit of the other for a period of time. If AbbVie is unable to satisfy its obligations under these agreements, including its indemnification obligations, Abbott could incur operational difficulties or losses. These arrangements could also lead to disputes between Abbott and AbbVie over Abbott's rights to certain shared property and rights and over the allocation of costs and revenues for products and operations. The separation and distribution agreement also provides for, among other things, indemnification obligations designed to make AbbVie financially responsible for substantially all liabilities that may exist relating to its business activities, whether incurred prior to or after AbbVie's separation from Abbott, as well as those obligations of Abbott assumed by AbbVie pursuant to the separation and distribution agreement. It is possible that a court would disregard the allocation agreed to between Abbott and AbbVie and require Abbott to assume responsibility for obligations allocated to AbbVie. Third parties could also seek to hold Abbott responsible for any of these liabilities or obligations. The indemnity rights Abbott has under the separation agreement may not be sufficient to protect Abbott. Even if Abbott is successful in obtaining indemnification, Abbott may have to bear losses temporarily. In addition, Abbott's indemnity obligations to AbbVie may be significant. 
These risks could negatively affect Abbott's results of operations. There could be significant liability if the distribution of AbbVie common stock to Abbott shareholders is determined to be a taxable transaction. Abbott received a private letter ruling from the Internal Revenue Service (IRS) to the effect that, among other things, the separation and the distribution of AbbVie qualifies as a transaction that is tax-free for U.S. federal income tax purposes under Sections 355 and 368(a)(1)(D) of the Internal Revenue Code (the Code). In addition, Abbott received an opinion from outside tax counsel to the effect that the separation and distribution qualifies as a transaction that is described in Sections 355(a) and 368(a)(1)(D) of the Code. The ruling and the opinion rely on certain facts, assumptions, representations and undertakings from Abbott and AbbVie regarding the past and future conduct of the companies' respective businesses and other matters. If any of these facts, assumptions, representations or undertakings are incorrect or not satisfied, Abbott and its shareholders may not be able to rely on the ruling or the opinion of tax counsel and could be subject to significant tax liabilities. Notwithstanding the receipt by Abbott of the private letter ruling from the IRS and opinion of tax counsel, the IRS could determine on audit that the separation is taxable if it determines that any of these facts, assumptions, representations or undertakings are not correct or have been violated or if it disagrees with the conclusions in the opinion that are not covered by the private letter ruling, or for other reasons, including as a result of certain significant changes in the share ownership of Abbott or AbbVie after the separation. If the separation is determined to be taxable for U.S. federal income tax purposes, Abbott and its shareholders that are subject to U.S. federal income tax could incur significant U.S. federal income tax liabilities. The international nature of Abbott's business subjects it to additional business risks that may cause its revenue and profitability to decline. Abbott's business is subject to risks associated with doing business internationally. Following the separation of AbbVie, sales outside of the United States are expected to make up approximately 70 percent of Abbott's net sales. The risks associated with Abbott's operations outside the United States include: •fluctuations in currency exchange rates; •changes in medical reimbursement policies and programs; •multiple regulatory requirements that are subject to change and that could restrict Abbott's ability to manufacture, market, and sell its products; •differing local product preferences and product requirements; •trade protection measures and import or export licensing requirements; •difficulty in establishing, staffing, and managing operations; •differing labor regulations; •potentially negative consequences from changes in or interpretations of tax laws; •political and economic instability, including sovereign debt issues; •price and currency exchange controls, limitations on participation in local enterprises, expropriation, nationalization, and other governmental action; •inflation, recession and fluctuations in interest rates; •compulsory licensing or diminished protection of intellectual property; and •potential penalties or other adverse consequences for violations of anti-corruption, anti-bribery and other similar laws and regulations, including the Foreign Corrupt Practices Act and the U.K. Bribery Act. 
Events contemplated by these risks may, individually or in the aggregate, have a material adverse effect on Abbott's revenues and profitability. Other factors can have a material adverse effect on Abbott's future profitability and financial condition. Many other factors can affect Abbott's profitability and its financial condition, including: •changes in or interpretations of laws and regulations, including changes in accounting standards, taxation requirements, product marketing application standards, product labeling, source, and use laws, and environmental laws; •differences between the fair value measurement of assets and liabilities and their actual value, particularly for pensions, retiree health care, stock compensation, intangibles, and goodwill; and for contingent liabilities such as litigation, the absence of a recorded amount, or an amount recorded at the minimum, compared to the actual amount; •changes in the rate of inflation (including the cost of raw materials, commodities, and supplies), interest rates, market value of Abbott's equity investments, and the performance of investments held by Abbott or Abbott's employee benefit trusts; •changes in the creditworthiness of counterparties that transact business with or provide services to Abbott or Abbott's employee benefit trusts; •changes in business, economic, and political conditions, including: war, political instability, terrorist attacks, the threat of future terrorist activity and related military action; natural disasters; the cost and availability of insurance due to any of the foregoing events; labor disputes, strikes, slow-downs, or other forms of labor or union activity; and pressure from third-party interest groups; •changes in Abbott's business units and investments and changes in the relative and absolute contribution of each to earnings and cash flow resulting from evolving business strategies, changing product mix, changes in tax laws or tax rates both in the U.S. and abroad and opportunities existing now or in the future; •changes in the buying patterns of a major distributor, retailer, or wholesale customer resulting from buyer purchasing decisions, pricing, seasonality, or other factors, or other problems with licensors, suppliers, distributors, and business partners; •changes in credit markets impacting Abbott's ability to obtain financing for its business operations; and •legal difficulties, any of which could preclude or delay commercialization of products or adversely affect profitability, including claims asserting statutory or regulatory violations, and adverse litigation decisions. CAUTIONARY STATEMENT REGARDING FORWARD-LOOKING STATEMENTS This Form 10-K contains forward-looking statements that are based on management's current expectations, estimates, and projections. Words such as \"expects,\" \"anticipates,\" \"intends,\" \"plans,\" \"believes,\" \"seeks,\" \"estimates,\" \"forecasts,\" variations of these words, and similar expressions are intended to identify these forward-looking statements. Certain factors, including but not limited to those identified under \"Item 1A. Risk Factors\" of this Form 10-K, may cause actual results to differ materially from current expectations, estimates, projections, forecasts, and from past results. No assurance can be made that any expectation, estimate, or projection contained in a forward-looking statement will be achieved or will not be affected by the factors cited above or other future events. 
Abbott undertakes no obligation to release publicly any revisions to forward-looking statements as the result of subsequent events or developments, except as required by law. ITEM 1B.\nITEM 1B. UNRESOLVED STAFF COMMENTS None. ITEM 2.\nITEM 2. PROPERTIES Abbott's corporate offices are located\n\n" + ] + } + ], + "source": [ + "print(\"Num examples:\", len(df))\n", + "print(\"First example:\")\n", + "for ex in df['text']: \n", + " print(ex)\n", + " print() \n", + " break \n", + "\n", + "if not integrity_check(temporary_mds_output_path): \n", + " raise ValueError(\"MDS has not been created correctly. There are missing shards!\")\n", + "\n", + "# Sanity Check\n", + "import numpy as np\n", + "from streaming import StreamingDataset\n", + "from transformers import AutoTokenizer\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(FT_API_args.model)\n", + "tokenizer.model_max_length = 5000000000 # Hack to prevent warnings from HuggingFace\n", + "mds_dataset = StreamingDataset(local=temporary_mds_output_path, shuffle=False)\n", + "for i in range(5):\n", + " l = np.frombuffer(mds_dataset[i]['tokens'], dtype=np.int64)\n", + " print(''.join(tokenizer.decode(l)))\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "298eb990-9160-4e1b-958f-33dd2c11b54b", + "showTitle": false, + "title": "" + } + }, + "source": [ + "#### Cost Estimation" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "5bc58cb3-0a19-4512-9584-642f0a2be4df", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset has ~985088 tokens that will be charged for during training\nBy default, you'll train for 3 epochs on this dataset\nBy default, you'll be charged for ~2955264 tokens\n" + ] + } + ], + "source": [ + "MAX_TOKENS_PER_EXAMPLE = FT_API_args.context_length if FT_API_args.context_length is not None else 4096\n", + "TARGET_EPOCHS = FT_API_args.training_duration if FT_API_args.training_duration is not None else 1 \n", + "n_epochs = TARGET_EPOCHS\n", + "n_train_examples = len(raw_dataset)\n", + "\n", + "n_billing_tokens_in_dataset = len(mds_dataset) * FT_API_args.context_length \n", + "print(f\"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training\")\n", + "print(f\"By default, you'll train for {n_epochs} epochs on this dataset\")\n", + "print(f\"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "8775fed8-6440-4a20-82f3-59b6cff73421", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "validate_and_tokenize_data", + "widgets": {} + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From 378a4e097c1fc1cddb0c931f385191f775335a8b Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Fri, 12 Jan 2024 00:03:37 -0800 Subject: [PATCH 53/63] update --- 
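The diff below replaces the single `SparkDataFrame` import with separate classic (`pyspark.sql.dataframe.DataFrame`) and Spark Connect (`pyspark.sql.connect.dataframe.DataFrame`) types and routes every type check through a new `isSparkDataFrame` helper. A minimal standalone sketch of that check follows for reference; it assumes only that `pyspark` (3.4+ for Spark Connect) may or may not be installed, and the snake_case name is illustrative rather than the helper's actual name.

```python
# Illustrative sketch of the broadened Spark DataFrame check introduced below.
# Assumes pyspark may be absent; a missing import simply degrades to False.
from typing import Any, Tuple

try:
    from pyspark.sql.dataframe import DataFrame as SparkSqlDataFrame
except ImportError:  # pyspark not installed
    SparkSqlDataFrame = None

try:
    from pyspark.sql.connect.dataframe import DataFrame as SparkConnDataFrame
except ImportError:  # pyspark < 3.4 or Spark Connect extras missing
    SparkConnDataFrame = None


def is_spark_dataframe(df: Any) -> bool:
    """Return True for classic Spark and Spark Connect DataFrames alike."""
    spark_types: Tuple[type, ...] = tuple(
        t for t in (SparkSqlDataFrame, SparkConnDataFrame) if t is not None)
    return bool(spark_types) and isinstance(df, spark_types)
```

Keeping both classes behind one predicate means the rest of `dataframe_to_mds` does not need to care which Spark client produced the DataFrame.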
 llmfoundry/utils/validation_utils.py | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/llmfoundry/utils/validation_utils.py b/llmfoundry/utils/validation_utils.py
index 57a5521079..d9d74b10b2 100644
--- a/llmfoundry/utils/validation_utils.py
+++ b/llmfoundry/utils/validation_utils.py
@@ -875,7 +875,8 @@ def pandas_processing_fn(df: pd.DataFrame,

 try:
     from pyspark import TaskContext
-    from pyspark.sql.dataframe import DataFrame as SparkDataFrame
+    from pyspark.sql.dataframe import DataFrame as SparkSqlDataFrame
+    from pyspark.sql.connect.dataframe import DataFrame as SparkConnDataFrame
     from pyspark.sql.types import (ArrayType, BinaryType, BooleanType, ByteType,
                                    DateType, DayTimeIntervalType, DecimalType,
                                    DoubleType, FloatType, IntegerType, LongType,
                                    MapType, ShortType, StringType,
@@ -930,7 +931,10 @@ def pandas_processing_fn(df: pd.DataFrame,
     'string' : 'str'
 }

-def infer_dataframe_schema(dataframe: Union[SparkDataFrame, DaskDataFrame],
+def isSparkDataFrame(dataframe: Union[SparkSqlDataFrame, SparkConnDataFrame, DaskDataFrame]):
+    return isinstance(dataframe, SparkSqlDataFrame) or isinstance(dataframe, SparkConnDataFrame)
+
+def infer_dataframe_schema(dataframe: Union[SparkSqlDataFrame, SparkConnDataFrame, DaskDataFrame],
                            user_defined_cols: Optional[Dict[str, Any]] = None) -> Optional[Dict]:
     """Retrieve schema to construct a dictionary or do sanity check for MDSWriter.

@@ -984,7 +988,7 @@ def map_dask_dtype(dask_data_type: Any) -> str:
             if user_dtype not in mds_supported_dtypes:
                 raise ValueError(f'{user_dtype} is not supported by MDSWriter')

-            if isinstance(dataframe, SparkDataFrame):
+            if isSparkDataFrame(dataframe):
                 actual_spark_dtype = dataframe.schema[col_name].dataType
                 mapped_mds_dtype = map_spark_dtype(actual_spark_dtype)
             else:
@@ -999,7 +1003,7 @@ def map_dask_dtype(dask_data_type: Any) -> str:

     schema_dict = {}

-    if isinstance(dataframe, SparkDataFrame):
+    if isSparkDataFrame(dataframe):
         schema = dataframe.schema
         for field in schema:
             dtype = map_spark_dtype(field.dataType)
@@ -1015,7 +1019,7 @@ def map_dask_dtype(dask_data_type: Any) -> str:

     return schema_dict

-def dataframeToMDS(dataframe: Union[SparkDataFrame, DaskDataFrame],
+def dataframeToMDS(dataframe: Union[SparkSqlDataFrame, SparkConnDataFrame, DaskDataFrame],
                    merge_index: bool = True,
                    mds_kwargs: Optional[Dict[str, Any]] = None,
                    udf_iterable: Optional[Callable] = None,
@@ -1030,7 +1034,7 @@ def dataframeToMDS(dataframe: Union[SparkDataFrame, DaskDataFrame],

     return dataframe_to_mds(dataframe, merge_index, mds_kwargs, udf_iterable, udf_kwargs)

-def dataframe_to_mds(dataframe: Union[SparkDataFrame, DaskDataFrame],
+def dataframe_to_mds(dataframe: Union[SparkSqlDataFrame, SparkConnDataFrame, DaskDataFrame],
                      merge_index: bool = True,
                      mds_kwargs: Optional[Dict[str, Any]] = None,
                      udf_iterable: Optional[Callable] = None,
@@ -1165,11 +1169,11 @@ def write_mds_spark(iterator: Iterable):
     if dataframe is None:
         raise ValueError(f'Input dataframe is None!')

-    if not (isinstance(dataframe, SparkDataFrame) or isinstance(dataframe, DaskDataFrame)):
+    if not (isSparkDataFrame(dataframe) or isinstance(dataframe, DaskDataFrame)):
         raise ValueError(f'dataframe_to_mds only takes Spark dataframe or Dask dataframe!')

-    if (isinstance(dataframe, SparkDataFrame) and dataframe.isEmpty()) or (isinstance(dataframe, DaskDataFrame) and len(dataframe.index)==0):
-        raise ValueError(f'Input dataframe is Empty1')
+    if (isSparkDataFrame(dataframe) and dataframe.isEmpty()) or (isinstance(dataframe, DaskDataFrame)
and len(dataframe.index)==0): + print(f'Return. Input dataframe is Empty! Nothing to be done!') if not mds_kwargs: mds_kwargs = {} @@ -1207,7 +1211,7 @@ def write_mds_spark(iterator: Iterable): else: mds_path = (cu.local, cu.remote) - if isinstance(dataframe, SparkDataFrame): + if isSparkDataFrame(dataframe): # Prepare partition schema result_schema = StructType([ StructField('mds_path_local', StringType(), False), @@ -1228,7 +1232,7 @@ def write_mds_spark(iterator: Iterable): keep_local_files = False if merge_index: - if isinstance(dataframe, SparkDataFrame): + if isSparkDataFrame(dataframe): index_files = list(set([(row['mds_path_local'], row['mds_path_remote']) for row in partitions])) else: index_files = list(set([(row[1]['mds_path_local'], row[1]['mds_path_remote']) for row in partitions.iterrows()])) @@ -1239,7 +1243,7 @@ def write_mds_spark(iterator: Iterable): shutil.rmtree(cu.local, ignore_errors=True) sum_fail_count = 0 - if isinstance(dataframe, SparkDataFrame): + if isSparkDataFrame(dataframe): for row in partitions: sum_fail_count += row['fail_count'] From af6e9aa99a2a85aafccc5cb8d4e21707212cf87a Mon Sep 17 00:00:00 2001 From: xiaohanzhan-db Date: Fri, 12 Jan 2024 08:05:05 +0000 Subject: [PATCH 54/63] update --- .../data_prep/validate_and_tokenize_data.py | 450 +++++------------- 1 file changed, 125 insertions(+), 325 deletions(-) diff --git a/scripts/data_prep/validate_and_tokenize_data.py b/scripts/data_prep/validate_and_tokenize_data.py index 8f96561e84..c05bd7f8dd 100644 --- a/scripts/data_prep/validate_and_tokenize_data.py +++ b/scripts/data_prep/validate_and_tokenize_data.py @@ -1,20 +1,9 @@ # Databricks notebook source # MAGIC %md -# MAGIC Copyright 2022 MosaicML LLM Foundry authors. -# MAGIC SPDX-License-Identifier: Apache-2.0 - -# COMMAND ---------- - -# MAGIC %md -# MAGIC JIRA: https://databricks.atlassian.net/jira/software/c/projects/STR/issues/STR-141?filter=allissues - -# COMMAND ---------- - -# MAGIC %md -# MAGIC # Warning: Important Alert Regarding the Script Usage +# MAGIC # FM FT API: Validation and Cost Estimation # MAGIC # MAGIC #### Usage Scenario: -# MAGIC This script is particularly designed for Databricks' customers who have access to Databricks notebook and UC. Our customers may find this script useful in scenarios where there is a risk of data being malformed. It acts as a preventive measure to ensure data integrity and helps in cost assessment for the fine-tuning process. +# MAGIC This notebook goes hand-in-hand with Databricks-Mosaicml's FT API. Our customers may find it useful in scenarios where there is a risk of data being malformed. It acts as a preventive measure to ensure data integrity and helps in cost assessment for the fine-tuning process. # MAGIC # MAGIC #### Script Purpose: # MAGIC - **Not for Training**: This script is not utilized during the training process. @@ -28,6 +17,7 @@ # MAGIC # MAGIC #### User Defines: # MAGIC - The inputs to this validation script is assumed to be the same or a subset of the FT API arguments, i.e., a configuration like below. Is this a valid assumption? 
+# MAGIC - For the reference, FT API expects following # MAGIC ``` # MAGIC cfg = { # MAGIC model: str, @@ -42,52 +32,25 @@ # MAGIC learning_rate: Optional[float] = None, # MAGIC context_length: Optional[int] = None, # MAGIC experiment_trackers: Optional[List[Dict]] = None, -# MAGIC data_prep_config: Optional[Dict] = None, # MAGIC disable_credentials_check: Optional[bool] = None, # MAGIC timeout: Optional[float] = 10, # MAGIC future: Literal[False] = False, # MAGIC } # MAGIC ``` -# MAGIC -# MAGIC #### Checks Include: -# MAGIC - check input dataset: -# MAGIC 1) verify if dataset input format is valid (need to be one of these: Huggingface, delta table, dbfs:/Volumes, cloud path); -# MAGIC - check HF input location: -# MAGIC 1) load dataset info and check if it is accessible; -# MAGIC 2) verify if the split exists. -# MAGIC - check cloud path location: -# MAGIC 1) check the cloud prefix is compliant with composers' object store supports (gs, s3, oci) -# MAGIC 2) check if list objects returns nothing. -# MAGIC - count_tokens: -# MAGIC 1) For IFT task: validate tokenization by running tokenizer + filter on the entire dataset. count the number of tokens. Throws error if there are any empty responses or prompts -# MAGIC 2) For CPT task: call donwload_text_to_mds.py and count the resulted mds dataset. Note this could take a long time. -# MAGIC -# MAGIC #### To-dos: -# MAGIC - Map the model to its expected eos_text / bos_text format automatically [Ref](https://databricks.slack.com/archives/C05K29T9NBF/p1703644153357929?thread_ts=1703643155.904289&cid=C05K29T9NBF) -# MAGIC - Automate tokenization for CPT. it is always really standard: sequence -> concat(tok(BOS), tok(sequence), tok(EOS)), and then concatenate sequences. [Ref](https://databricks.slack.com/archives/C05K29T9NBF/p1703698056000399?thread_ts=1703643155.904289&cid=C05K29T9NBF) -# MAGIC - Add ``preprocessing_fn`` here. -- We don't need to. FT API does not expose preprocessing_fn. -# MAGIC - Add a sample_ratio parameter so users can run the validation on a portion of the whole dataest then estimate by the scaling factor. -# MAGIC - Put utility function in a validation branch. 
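The to-do above describes the standard CPT tokenization as sequence -> concat(tok(BOS), tok(sequence), tok(EOS)), followed by concatenating sequences into fixed-length samples. A minimal sketch of that pattern is shown below; it assumes a Hugging Face tokenizer, the helper name and arguments are illustrative only, and it is not the exact routine used by `ConcatTokensDataset`.

```python
# Illustrative sketch of the concat-token CPT pattern described above.
# Assumes a Hugging Face tokenizer; `texts` and `max_length` are placeholders.
from typing import Iterator, List

import numpy as np
from transformers import AutoTokenizer


def concat_tokenize(texts: List[str], tokenizer_name: str, max_length: int,
                    bos_text: str = '', eos_text: str = '') -> Iterator[np.ndarray]:
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    bos = tokenizer(bos_text, add_special_tokens=False)['input_ids']
    eos = tokenizer(eos_text, add_special_tokens=False)['input_ids']

    buffer: List[int] = []
    for text in texts:
        ids = tokenizer(text, add_special_tokens=False)['input_ids']
        # sequence -> concat(tok(BOS), tok(sequence), tok(EOS))
        buffer.extend(bos + ids + eos)
        # then concatenate sequences and emit fixed-length samples
        while len(buffer) >= max_length:
            yield np.asarray(buffer[:max_length], dtype=np.int64)
            buffer = buffer[max_length:]
```

This is broadly what `ConcatTokensDataset` does when `pandas_processing_fn` tokenizes each partition for `dataframe_to_mds` in the CPT section later in the notebook.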
-# MAGIC - - -# COMMAND ---------- - -# %pip install git+https://github.com/mosaicml/llm-foundry.git@byod/data_validation -%pip install git+https://github.com/XiaohanZhangCMU/llm-foundryX.git@validation # COMMAND ---------- -dbutils.library.restartPython() +# MAGIC %md +# MAGIC #### Install llmfoundry Validation Branch # COMMAND ---------- -# MAGIC %md -# MAGIC # Instruction Fine Tuning +# %pip install git+https://github.com/mosaicml/llm-foundry.git@byod/data_validation +%pip install git+https://github.com/XiaohanZhangCMU/llm-foundryX.git@validation # COMMAND ---------- -# MAGIC %md -# MAGIC #### All Utility Functions +dbutils.library.restartPython() # COMMAND ---------- @@ -108,228 +71,42 @@ from composer.utils import (ObjectStore, maybe_create_object_store_from_uri, parse_uri) from llmfoundry.utils import build_tokenizer -from llmfoundry.utils import (create_om_cfg, token_counts_and_validation, +from llmfoundry.utils import (create_om_cfg, token_counts_and_validation, token_counts, check_HF_datasets, is_hf_dataset_path, is_uc_delta_table, pandas_processing_fn, integrity_check, convert_text_to_mds, - _args_str) + _args_str, plot_hist) from llmfoundry.data import ConcatTokensDataset from streaming.base.storage.download import download_file from streaming.base.storage.upload import CloudUploader from streaming.base.converters import dataframe_to_mds - -# def create_om_cfg(FT_API_args: Namespace): -# task_type = FT_API_args.task_type - -# train_data_path = FT_API_args.train_data_path -# split = 'train' - -# if is_hf_dataset_path(FT_API_args.train_data_path): -# train_data_path, split = '/'.join(FT_API_args.train_data_path.split('/')[:2]), FT_API_args.train_data_path.split('/')[-1] - -# model = FT_API_args.model -# max_seq_len = FT_API_args.context_length - -# common_args = { -# 'drop_last': False, -# 'num_workers': 2, -# 'prefetch_factor': 2, -# 'pin_memory': False, -# 'persistent_workers': False, -# 'timeout': 0 -# } -# if task_type == 'INSTRUCTION_FINETUNE': -# cfg = om.create({ -# 'dataset': { -# 'hf_name': train_data_path, -# 'split': split, -# 'max_seq_len': max_seq_len, -# 'decoder_only_format': True, -# 'allow_pad_trimming': False, -# 'shuffle': True, -# }, -# **common_args -# }) - -# else: -# cfg = om.create({ -# 'name': 'finetuning', -# 'dataset': { -# 'remote': train_data_path, -# 'local': train_data_path, -# 'split': split, -# 'max_seq_len': max_seq_len, -# 'decoder_only_format': True, -# 'allow_pad_trimming': False, -# 'packing_ratio': None, -# 'shuffle': True, -# }, -# **common_args -# }) - -# tokenizer = build_tokenizer( -# tokenizer_name=model, -# tokenizer_kwargs={'model_max_length': max_seq_len}, -# ) - -# return cfg, tokenizer - -# def token_counts_and_validation(FT_API_args): -# from llmfoundry.data.finetuning import build_finetuning_dataloader - -# cfg, tokenizer = create_om_cfg(FT_API_args) - -# device_batch_size = 1 -# dataspec = build_finetuning_dataloader(cfg, tokenizer, device_batch_size) -# dataloader = dataspec.dataloader -# token_counting_func = dataspec.get_num_tokens_in_batch - -# total_tokens = [] -# for batch in dataloader: -# n_batch_tokens = token_counting_func(batch) -# if n_batch_tokens == 0: -# raise ValueError("Empty train sample") -# total_tokens.append(n_batch_tokens) -# return total_tokens - -# #---------------------------------------- IFT ---------------------------------------- # - -# def check_HF_datasets(dataset_names_with_splits: list): -# token = os.environ.get('HUGGING_FACE_HUB_TOKEN') -# for dataset_name_with_split in 
dataset_names_with_splits: -# dataset_name, split = os.path.split(dataset_name_with_split) -# # make sure we have a dataset and split -# if not dataset_name or not split: -# return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Please ensure that you include the split name (e.g. 'mosaicml/dolly_hhrlhf/train')." -# # check user access to the dataset -# try: -# _ = dataset_info(dataset_name) -# except: -# token_warning = '' -# if not token: -# token_warning = ' If this is a private dataset, please set your HUGGING_FACE_HUB_TOKEN using: mcli create secret hf.' -# return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Please ensure that the dataset exists and that you have access to it. Remember to include the split name (e.g. 'mosaicml/dolly_hhrlhf/train')." + token_warning -# # check that split exists -# try: -# splits = get_dataset_split_names(dataset_name) -# except: # error raised in the case of multiple subsets -# return False, f'Failed to load Hugging Face dataset {dataset_name_with_split}. Please make sure that the split is valid and that your dataset does not have subsets.' -# if split not in splits: -# return False, f'Failed to load Hugging Face dataset {dataset_name_with_split}. Split not found.' -# return True, '' - - -# def is_hf_dataset_path(path: str): -# """Check if a given string is a dataset path used by Hugging Face. - -# Args: -# path (str): The string to be checked. - -# Returns: -# bool: True if the string is a dataset path, False otherwise. -# """ -# # Regular expression to match the dataset path pattern -# pattern = r'^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+/?(train|validation|test)?/?$' - -# return bool(re.match(pattern, path)) - -# def is_uc_delta_table(name: str): -# """name is in the form of catalog.scheme.tablename - -# Args: -# name (str): a string folder/file/table path -# Return: -# (bool): True if name is valid UC delta table format -# """ -# return '.' in name and '/' not in name and '\\' not in name and len(name.split('.'))==3 - -# #---------------------------------------- CPT ---------------------------------------- # - -# def pandas_processing_fn(df: pd.DataFrame, -# **args: Any) -> Iterable[Dict[str, bytes]]: -# """Tokenize helper function for dataframe_to_mds. - -# Args: -# df (pandas.DataFrame): The input pandas DataFrame that needs to be processed. -# **args : Additional arguments to be passed to the 'process_some_data' function during processing. - -# Returns: -# iterable obj -# """ -# hf_dataset = hf_datasets.Dataset.from_pandas(df=df) -# tokenizer = AutoTokenizer.from_pretrained(args['tokenizer']) -# tokenizer.model_max_length = 5000000000 # Hack to prevent warnings from HuggingFace -# dataset = ConcatTokensDataset( -# hf_dataset=hf_dataset, -# max_length=args.get('concat_tokens', None), -# tokenizer=tokenizer, -# eos_text=args.get('eos_text', None), -# bos_text=args.get('bos_text', None), -# no_wrap=args.get('no_wrap', None), -# ) - -# for sample in dataset: # pyright: ignore -# yield sample - -# def integrity_check(out: Union[str, Tuple[str, str]]): -# """Check if the index file has integrity. - -# If index is a cloud url, first download it to a temp local file. 
- -# Args: -# out (Union[str, Tuple[str,str]]): MDS dataset path -# """ - -# def count_shards(mds_root: str): -# n_shard_files = 0 -# cu = CloudUploader.get(mds_root, exist_ok=True, keep_local=True) -# for o in cu.list_objects(): -# if o.endswith('.mds'): -# n_shard_files += 1 -# return n_shard_files - -# cu = CloudUploader.get(out, keep_local=True, exist_ok=True) - -# with tempfile.TemporaryDirectory() as temp_dir: -# if cu.remote: -# download_file(os.path.join(cu.remote, 'index.json'), -# os.path.join(temp_dir, 'index.json'), -# timeout=60) -# actual_n_shard_files = count_shards(cu.remote) -# local_merged_index_path = os.path.join(temp_dir, 'index.json') -# else: -# local_merged_index_path = os.path.join(cu.local, 'index.json') -# actual_n_shard_files = count_shards(cu.local) - -# merged_index = json.load(open(local_merged_index_path, 'r')) -# n_shard_files = len( -# {b['raw_data']['basename'] for b in merged_index['shards']}) -# return n_shard_files == actual_n_shard_files +# COMMAND ---------- +# MAGIC %md +# MAGIC # Instruction Fine Tuning # COMMAND ---------- # MAGIC %md # MAGIC #### User Defines -# MAGIC Use the same input arguments you will want to provide to FT API # COMMAND ---------- FT_API_args = Namespace( model='EleutherAI/gpt-neox-20b', - train_data_path= 'tatsu-lab/alpaca/train', # 'main.streaming.random_large_table', # # 'mosaicml/dolly_hhrlhf/train', # tatsu-lab/alpaca/train', - save_folder= 'dbfs:/databricks/mlflow-tracking/EXPERIMENT_ID/RUN_ID/artifacts/checkpoints', + train_data_path= 'main.streaming.random_large_table', # '/Volumes/main/mosaic_hackathon/managed-volume/IFT/train.jsonl', # 'tatsu-lab/alpaca/train', # , # 'tatsu-lab/alpaca/train', # 'mosaicml/dolly_hhrlhf/train', # tatsu-lab/alpaca/train', task_type='INSTRUCTION_FINETUNE', training_duration=3, context_length=2048, ) -temporary_jsonl_data_path = '/tmp/ft_data/train/' -os.environ['HF_ASSETS_CACHE'] = '/tmp/' -os.environ['HF_HOME'] = '/tmp/' -os.environ['HF_HUB_CACHE'] = '/tmp/' +temporary_jsonl_data_path = '/tmp/ft_data_11Jan24_2/train' +# os.environ['HF_ASSETS_CACHE'] = '/tmp/' +# os.environ['HF_HOME'] = '/tmp/' +# os.environ['HF_HUB_CACHE'] = '/tmp/' os.environ['HF_DATASETS_CACHE'] = '/tmp/' +os.makedirs(temporary_jsonl_data_path, exist_ok=True) # COMMAND ---------- @@ -352,35 +129,39 @@ raw_dataset = None -if FT_API_args.train_data_path.endswith('.jsonl') and os.path.exists(FT_API_args.train_data_path): - data_path = FT_API_args.train_data_path - raw_dataset = datasets.load_dataset('json', data_path) - if is_hf_dataset_path(FT_API_args.train_data_path): check_HF_datasets(FT_API_args.train_data_path) dataset_id, split = '/'.join(FT_API_args.train_data_path.split('/')[:2]), FT_API_args.train_data_path.split('/')[-1] raw_dataset = datasets.load_dataset(dataset_id, split=split) - -if is_uc_delta_table(FT_API_args.train_data_path): - delta_table_name = FT_API_args.train_data_path - df = spark.read.table(delta_table_name) - df = df.toPandas() - df.rename(columns={'prompts': 'prompt', 'responses': 'response'}, inplace=True) - df.to_json(os.path.join(temporary_jsonl_data_path, 'ift.jsonl'), orient='records', lines=True) - raw_dataset = datasets.Dataset.from_pandas(df) - FT_API_args.train_data_path = temporary_jsonl_data_path +else: + if is_uc_delta_table(FT_API_args.train_data_path): + df = spark.read.table(FT_API_args.train_data_path).toPandas() + df.to_json(os.path.join(temporary_jsonl_data_path, 'data.jsonl'), orient='records', lines=True) + raw_dataset = datasets.Dataset.from_pandas(df) + 
FT_API_args.train_data_path = temporary_jsonl_data_path + else: + # train_data_path is a jonsl file (local/remote) + from composer.utils import dist, get_file, parse_uri + data_path = FT_API_args.train_data_path + backend, _, _ = parse_uri(data_path) + if backend not in ['', None]: # It's a remote path, download before loading it + with tempfile.TemporaryDirectory() as tmp_dir: + destination = os.path.join(tmp_dir, 'data.jsonl') + get_file(data_path, destination) + df = pd.read_json(destination, orient='records', lines=True) + else: + df = pd.read_json(data_path, orient='records', lines=True) + + raw_dataset = datasets.Dataset.from_pandas(df) + FT_API_args.train_data_path = os.path.dirname(data_path) if raw_dataset is None: raise RuntimeError("Can't find a proper ingestion method") # COMMAND ---------- -!mkdir -p {temporary_jsonl_data_path} - -# COMMAND ---------- - # MAGIC %md -# MAGIC #### Validation and Statistics +# MAGIC #### Validation # COMMAND ---------- @@ -392,23 +173,31 @@ print() break +_ALLOWED_RESPONSE_KEYS = {'response', 'completion'} +_ALLOWED_PROMPT_KEYS = {'prompt'} format_errors = defaultdict(int) for ex in raw_dataset: if not isinstance(ex, dict): format_errors["data_type"] += 1 continue - - prompts = ex.get("prompt", None) - if not prompts: + + found = False + for key in _ALLOWED_PROMPT_KEYS: + prompts = ex.get(key, None) + if prompts: + found = True + if not found: format_errors["missing_prompt"] += 1 - continue - responses = ex.get("response", None) - if not responses: + found = False + for key in _ALLOWED_RESPONSE_KEYS: + responses = ex.get("response", None) + if responses: + found = True + if not found: format_errors["missing_response"] += 1 - continue - + if format_errors: print("Oops! Found errors:") for k, v in format_errors.items(): @@ -425,13 +214,9 @@ # COMMAND ---------- -MAX_TOKENS_PER_EXAMPLE = FT_API_args.context_length if FT_API_args.context_length is not None else 4096 -TARGET_EPOCHS = FT_API_args.training_duration if FT_API_args.training_duration is not None else 1 -n_epochs = TARGET_EPOCHS -n_train_examples = len(raw_dataset) - -batch_tokens = token_counts_and_validation(FT_API_args) -n_billing_tokens_in_dataset = sum(batch_tokens) +n_epochs = FT_API_args.training_duration if FT_API_args.training_duration is not None else 1 +batch_tokens = token_counts3(FT_API_args) +n_billing_tokens_in_dataset = sum(batch_tokens['ntokens']) print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training") print(f"By default, you'll train for {n_epochs} epochs on this dataset") @@ -439,6 +224,16 @@ # COMMAND ---------- +plot_hist(pd.Series(batch_tokens['ntokens'])) + +# COMMAND ---------- + +# all_tokens = token_counts_and_validation(FT_API_args) +# plot_hist(pd.Series(all_tokens)) +# pd.Series(all_tokens).max(), max(batch_tokens['ntokens']) + +# COMMAND ---------- + # MAGIC %md # MAGIC # Continued Pretrain @@ -451,54 +246,32 @@ FT_API_args = Namespace( model='EleutherAI/gpt-neox-20b', - train_data_path= 'dbfs:/xiaohan-test/test_cpt/', - save_folder= 'dbfs:/databricks/mlflow-tracking/EXPERIMENT_ID/RUN_ID/artifacts/checkpoints', + train_data_path= '/Volumes/main/mosaic_hackathon/managed-volume/ABT', task_type='CONTINUED_PRETRAIN', training_duration=3, context_length=2048, ) - -temporary_mds_output_path = '/tmp/xiaohan-test/test_mds' +temporary_mds_output_path = '/tmp/xiaohan-test-11Jan24_2/test_mds' # COMMAND ---------- # MAGIC %md -# MAGIC #### Data Loading (from text to MDS) +# MAGIC #### Ingestion, Tokenization and Materialization 
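For reference, the direct route that the FT API backend takes (and that an earlier revision of this cell invoked explicitly) boils down to a single `convert_text_to_mds` call. The sketch below is illustrative only: `FT_API_args` and `temporary_mds_output_path` come from the cells above, the eos/bos/compression settings mirror the `udf_kwargs` used elsewhere in this notebook, and `processes`, `reprocess`, and the empty `args_str` are assumed values rather than FT API defaults.

```python
# Minimal sketch of the direct text -> MDS conversion route.
# FT_API_args and temporary_mds_output_path are defined in the cells above;
# processes/reprocess/args_str are assumed values, not FT API defaults.
from llmfoundry.utils import convert_text_to_mds

convert_text_to_mds(
    tokenizer_name=FT_API_args.model,
    output_folder=temporary_mds_output_path,
    input_folder=FT_API_args.train_data_path,
    concat_tokens=FT_API_args.context_length,
    eos_text='',
    bos_text='',
    no_wrap=False,
    compression='zstd',
    processes=1,
    reprocess=True,
    args_str='',
)
```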
# MAGIC -# MAGIC Copy [llmfoundry/scripts/data_prep/convert_text_to_mds.py](https://github.com/mosaicml/llm-foundry/blob/main/scripts/data_prep/convert_text_to_mds.py) here and run the cell below - -# COMMAND ---------- - -from convert_text_to_mds import convert_text_to_mds, parse_args, _args_str - -# check if train_data_path is a valid object store that composer supports -cfg, tokenizer = create_om_cfg(FT_API_args) - -input_folder = FT_API_args.train_data_path -output_folder = FT_API_args.save_folder -concat_tokens = FT_API_args.context_length -tokenizer_name = FT_API_args.model - -# Run convert_text_to_mds.py and dump MDS dataset to "save_folder" -args = parse_args(tokenizer, concat_tokens, output_folder, input_folder) -convert_text_to_mds(tokenizer_name=args.tokenizer, - output_folder=temporary_mds_output_path, - input_folder=args.input_folder, - concat_tokens=args.concat_tokens, - eos_text=args.eos_text, - bos_text=args.bos_text, - no_wrap=args.no_wrap, - compression=args.compression, - processes=args.processes, - reprocess=args.reprocess, - args_str=_args_str(args)) - +# MAGIC CPT takes a folder of txt files as input. It tokenize the text fields and materialize as a streaming dataset of MDS format. +# MAGIC +# MAGIC FT API uses [llmfoundry/scripts/data_prep/convert_text_to_mds.py](https://github.com/mosaicml/llm-foundry/blob/main/scripts/data_prep/convert_text_to_mds.py) to download all the txt files and convert them to MDS. +# MAGIC +# MAGIC In this notebook, we provide two additional approaches via Spark and Dask. +# MAGIC +# MAGIC **Warning** CPT datasets are normally much larger than IFT, so the tokenization and materialization can be very time consuming. # COMMAND ---------- -# MAGIC %md -# MAGIC #### Alternative: Delta Ingestion -# MAGIC Once you have credentials set up with dbutils.secret or init script, You can ingest the folder of txt files and have the schema automatically inferred. The result is a spark dataframe and can be converted to MDS while Streaming's utility +# MAGIC %md +# MAGIC **1. Delta Ingestion --> Spark Dataframe:** +# MAGIC +# MAGIC If you don't have a single-user-assigned cluster and DBR < 14.3, move on to option-2. Otherwise, you can leverage Delta Ingestion's tools to ingest the folder of txt files as a Spark dataframe and have the schema automatically inferred. # COMMAND ---------- @@ -506,8 +279,7 @@ output_location = FT_API_args.train_data_path + '/*.txt' df = spark.sql("SELECT * FROM read_files('%s')" % output_location).withColumnRenamed('value', 'text') -df.show() - +df.show(2) mds_kwargs = { 'out': temporary_mds_output_path, 'columns': { @@ -532,6 +304,44 @@ # COMMAND ---------- +# MAGIC %md +# MAGIC **2. Dask.bag --> Dask.DataFrame:** +# MAGIC +# MAGIC If you are on UC enabled clusters where mapInPandas does not work, you can try Dask. 
Dask uses the current node as a ```Local Cluster``` + +# COMMAND ---------- + +import dask.bag as db + +input_folder = FT_API_args.train_data_path +pattern = input_folder + '/*.txt' +b = db.read_text(pattern, linedelimiter='\n', blocksize='128MiB') +df = b.to_dataframe(columns = ['text']) +df = df[df.text != '\n'] + +mds_kwargs = { + 'out': temporary_mds_output_path, + 'columns': { + 'tokens': 'bytes' + }, + 'keep_local': True, +} +udf_kwargs = { + 'concat_tokens': FT_API_args.context_length, + 'tokenizer': FT_API_args.model, + 'eos_text': '', + 'compression': 'zstd', + 'no_wrap': False, + 'bos_text': '', +} +df_to_mds(df, + merge_index=True, + mds_kwargs=mds_kwargs, + udf_iterable=pandas_processing_fn2, + udf_kwargs=udf_kwargs) + +# COMMAND ---------- + # MAGIC %md # MAGIC #### Validation @@ -552,7 +362,7 @@ from streaming import StreamingDataset tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) tokenizer.model_max_length = 5000000000 # Hack to prevent warnings from HuggingFace -dataset = StreamingDataset(local=mds_output_path, shuffle=False) +mds_dataset = StreamingDataset(local=mds_output_path, shuffle=False) for i in range(5): l = np.frombuffer(dataset[i]['tokens'], dtype=np.int64) print(''.join(tokenizer.decode(l))) @@ -570,17 +380,7 @@ n_epochs = TARGET_EPOCHS n_train_examples = len(raw_dataset) -batch_tokens = token_counts_and_validation(FT_API_args) -n_billing_tokens_in_dataset = sum(batch_tokens) - +n_billing_tokens_in_dataset = len(mds_dataset) * FT_API_args.context_length print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training") print(f"By default, you'll train for {n_epochs} epochs on this dataset") print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens") - -# COMMAND ---------- - - - -# COMMAND ---------- - - From 4e286ecdbd44808607faf99f284bedf933e2eb98 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Fri, 12 Jan 2024 00:09:16 -0800 Subject: [PATCH 55/63] remove script and tests, keep notebook --- .../data_prep/validate_and_tokenize_data.py | 386 ------------------ .../test_validate_and_tokenize_data.py | 131 ------ 2 files changed, 517 deletions(-) delete mode 100644 scripts/data_prep/validate_and_tokenize_data.py delete mode 100644 tests/a_scripts/data_prep/test_validate_and_tokenize_data.py diff --git a/scripts/data_prep/validate_and_tokenize_data.py b/scripts/data_prep/validate_and_tokenize_data.py deleted file mode 100644 index c05bd7f8dd..0000000000 --- a/scripts/data_prep/validate_and_tokenize_data.py +++ /dev/null @@ -1,386 +0,0 @@ -# Databricks notebook source -# MAGIC %md -# MAGIC # FM FT API: Validation and Cost Estimation -# MAGIC -# MAGIC #### Usage Scenario: -# MAGIC This notebook goes hand-in-hand with Databricks-Mosaicml's FT API. Our customers may find it useful in scenarios where there is a risk of data being malformed. It acts as a preventive measure to ensure data integrity and helps in cost assessment for the fine-tuning process. -# MAGIC -# MAGIC #### Script Purpose: -# MAGIC - **Not for Training**: This script is not utilized during the training process. -# MAGIC - **Ad-Hoc Validation**: It serves as an ad-hoc utility for users to run independently prior to starting fine-tuning. -# MAGIC - **Data Verification**: Its primary function is to validate the user's data before they invoke the Fine-Tuning (FT) API. -# MAGIC - **Cost Estimation**: Users can estimate the cost implications with this script. 
-# MAGIC -# MAGIC #### Note on Long-Term Solution: -# MAGIC - **Temporary Measure**: This script is a stop-gap solution. -# MAGIC - **Future Development**: We are in the process of developing a long-term data preparation service, which will eventually replace this script. -# MAGIC -# MAGIC #### User Defines: -# MAGIC - The inputs to this validation script is assumed to be the same or a subset of the FT API arguments, i.e., a configuration like below. Is this a valid assumption? -# MAGIC - For the reference, FT API expects following -# MAGIC ``` -# MAGIC cfg = { -# MAGIC model: str, -# MAGIC train_data_path: str, -# MAGIC save_folder: str, -# MAGIC *, -# MAGIC task_type: Optional[str] = "INSTRUCTION_FINETUNE", -# MAGIC eval_data_path: Optional[str] = None, -# MAGIC eval_prompts: Optional[List[str]] = None, -# MAGIC custom_weights_path: Optional[str] = None, -# MAGIC training_duration: Optional[str] = None, -# MAGIC learning_rate: Optional[float] = None, -# MAGIC context_length: Optional[int] = None, -# MAGIC experiment_trackers: Optional[List[Dict]] = None, -# MAGIC disable_credentials_check: Optional[bool] = None, -# MAGIC timeout: Optional[float] = 10, -# MAGIC future: Literal[False] = False, -# MAGIC } -# MAGIC ``` - -# COMMAND ---------- - -# MAGIC %md -# MAGIC #### Install llmfoundry Validation Branch - -# COMMAND ---------- - -# %pip install git+https://github.com/mosaicml/llm-foundry.git@byod/data_validation -%pip install git+https://github.com/XiaohanZhangCMU/llm-foundryX.git@validation - -# COMMAND ---------- - -dbutils.library.restartPython() - -# COMMAND ---------- - -import os -import re -import json -import tempfile -import numpy as np -import pandas as pd -from collections import defaultdict -from omegaconf import OmegaConf as om -from argparse import ArgumentParser, Namespace -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union - -import datasets -from datasets import get_dataset_split_names -from huggingface_hub import dataset_info - -from composer.utils import (ObjectStore, maybe_create_object_store_from_uri, parse_uri) -from llmfoundry.utils import build_tokenizer -from llmfoundry.utils import (create_om_cfg, token_counts_and_validation, token_counts, - check_HF_datasets, is_hf_dataset_path, is_uc_delta_table, - pandas_processing_fn, integrity_check, convert_text_to_mds, - _args_str, plot_hist) -from llmfoundry.data import ConcatTokensDataset - -from streaming.base.storage.download import download_file -from streaming.base.storage.upload import CloudUploader -from streaming.base.converters import dataframe_to_mds - -# COMMAND ---------- - -# MAGIC %md -# MAGIC # Instruction Fine Tuning - -# COMMAND ---------- - -# MAGIC %md -# MAGIC #### User Defines - -# COMMAND ---------- - -FT_API_args = Namespace( - model='EleutherAI/gpt-neox-20b', - train_data_path= 'main.streaming.random_large_table', # '/Volumes/main/mosaic_hackathon/managed-volume/IFT/train.jsonl', # 'tatsu-lab/alpaca/train', # , # 'tatsu-lab/alpaca/train', # 'mosaicml/dolly_hhrlhf/train', # tatsu-lab/alpaca/train', - task_type='INSTRUCTION_FINETUNE', - training_duration=3, - context_length=2048, -) - -temporary_jsonl_data_path = '/tmp/ft_data_11Jan24_2/train' -# os.environ['HF_ASSETS_CACHE'] = '/tmp/' -# os.environ['HF_HOME'] = '/tmp/' -# os.environ['HF_HUB_CACHE'] = '/tmp/' -os.environ['HF_DATASETS_CACHE'] = '/tmp/' -os.makedirs(temporary_jsonl_data_path, exist_ok=True) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC #### Data Loading -# MAGIC -# MAGIC The IFT data needs to stay with a format -# 
MAGIC ``` -# MAGIC prompt: xxx -# MAGIC response or completion: yyy -# MAGIC ``` -# MAGIC -# MAGIC Based on FT_API_args.train_data_path, we will select an ingestion method from three options. -# MAGIC -# MAGIC - Option-1. Your data is a JSONL file which stores in an object store supported by Composer. [Example file to-be-added](todo - add a link to such a file) -# MAGIC - Option-2. You provide a Huggingface dataset ID. Note you need to provide a split as well. [Example dataset link to-be-added](huggingface.co) -# MAGIC - Option-3. You have a delta table. - -# COMMAND ---------- - -raw_dataset = None - -if is_hf_dataset_path(FT_API_args.train_data_path): - check_HF_datasets(FT_API_args.train_data_path) - dataset_id, split = '/'.join(FT_API_args.train_data_path.split('/')[:2]), FT_API_args.train_data_path.split('/')[-1] - raw_dataset = datasets.load_dataset(dataset_id, split=split) -else: - if is_uc_delta_table(FT_API_args.train_data_path): - df = spark.read.table(FT_API_args.train_data_path).toPandas() - df.to_json(os.path.join(temporary_jsonl_data_path, 'data.jsonl'), orient='records', lines=True) - raw_dataset = datasets.Dataset.from_pandas(df) - FT_API_args.train_data_path = temporary_jsonl_data_path - else: - # train_data_path is a jonsl file (local/remote) - from composer.utils import dist, get_file, parse_uri - data_path = FT_API_args.train_data_path - backend, _, _ = parse_uri(data_path) - if backend not in ['', None]: # It's a remote path, download before loading it - with tempfile.TemporaryDirectory() as tmp_dir: - destination = os.path.join(tmp_dir, 'data.jsonl') - get_file(data_path, destination) - df = pd.read_json(destination, orient='records', lines=True) - else: - df = pd.read_json(data_path, orient='records', lines=True) - - raw_dataset = datasets.Dataset.from_pandas(df) - FT_API_args.train_data_path = os.path.dirname(data_path) - -if raw_dataset is None: - raise RuntimeError("Can't find a proper ingestion method") - -# COMMAND ---------- - -# MAGIC %md -# MAGIC #### Validation - -# COMMAND ---------- - -# Initial dataset stats -print("Num examples:", len(raw_dataset)) -print("First example:") -for ex in raw_dataset: - print(ex) - print() - break - -_ALLOWED_RESPONSE_KEYS = {'response', 'completion'} -_ALLOWED_PROMPT_KEYS = {'prompt'} -format_errors = defaultdict(int) - -for ex in raw_dataset: - if not isinstance(ex, dict): - format_errors["data_type"] += 1 - continue - - found = False - for key in _ALLOWED_PROMPT_KEYS: - prompts = ex.get(key, None) - if prompts: - found = True - if not found: - format_errors["missing_prompt"] += 1 - - found = False - for key in _ALLOWED_RESPONSE_KEYS: - responses = ex.get("response", None) - if responses: - found = True - if not found: - format_errors["missing_response"] += 1 - -if format_errors: - print("Oops! Found errors:") - for k, v in format_errors.items(): - print(f"{k}: {v}") -else: - print("Congratulations! 
No errors found") - -# COMMAND ---------- - -# MAGIC %md -# MAGIC #### Cost Estimation -# MAGIC -# MAGIC Tokenize the raw dataset and we see some statistics of the tokens and estimate the overall cost based on default trainining duration - -# COMMAND ---------- - -n_epochs = FT_API_args.training_duration if FT_API_args.training_duration is not None else 1 -batch_tokens = token_counts3(FT_API_args) -n_billing_tokens_in_dataset = sum(batch_tokens['ntokens']) - -print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training") -print(f"By default, you'll train for {n_epochs} epochs on this dataset") -print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens") - -# COMMAND ---------- - -plot_hist(pd.Series(batch_tokens['ntokens'])) - -# COMMAND ---------- - -# all_tokens = token_counts_and_validation(FT_API_args) -# plot_hist(pd.Series(all_tokens)) -# pd.Series(all_tokens).max(), max(batch_tokens['ntokens']) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC # Continued Pretrain - -# COMMAND ---------- - -# MAGIC %md -# MAGIC #### User Defines - -# COMMAND ---------- - -FT_API_args = Namespace( - model='EleutherAI/gpt-neox-20b', - train_data_path= '/Volumes/main/mosaic_hackathon/managed-volume/ABT', - task_type='CONTINUED_PRETRAIN', - training_duration=3, - context_length=2048, -) -temporary_mds_output_path = '/tmp/xiaohan-test-11Jan24_2/test_mds' - -# COMMAND ---------- - -# MAGIC %md -# MAGIC #### Ingestion, Tokenization and Materialization -# MAGIC -# MAGIC CPT takes a folder of txt files as input. It tokenize the text fields and materialize as a streaming dataset of MDS format. -# MAGIC -# MAGIC FT API uses [llmfoundry/scripts/data_prep/convert_text_to_mds.py](https://github.com/mosaicml/llm-foundry/blob/main/scripts/data_prep/convert_text_to_mds.py) to download all the txt files and convert them to MDS. -# MAGIC -# MAGIC In this notebook, we provide two additional approaches via Spark and Dask. -# MAGIC -# MAGIC **Warning** CPT datasets are normally much larger than IFT, so the tokenization and materialization can be very time consuming. - -# COMMAND ---------- - -# MAGIC %md -# MAGIC **1. Delta Ingestion --> Spark Dataframe:** -# MAGIC -# MAGIC If you don't have a single-user-assigned cluster and DBR < 14.3, move on to option-2. Otherwise, you can leverage Delta Ingestion's tools to ingest the folder of txt files as a Spark dataframe and have the schema automatically inferred. - -# COMMAND ---------- - -dbutils.fs.ls(FT_API_args.train_data_path) - -output_location = FT_API_args.train_data_path + '/*.txt' -df = spark.sql("SELECT * FROM read_files('%s')" % output_location).withColumnRenamed('value', 'text') -df.show(2) -mds_kwargs = { - 'out': temporary_mds_output_path, - 'columns': { - 'tokens': 'bytes' - }, - 'keep_local': True -} -udf_kwargs = { - 'concat_tokens': FT_API_args.context_length, - 'tokenizer': FT_API_args.model, - 'eos_text': '', - 'compression': 'zstd', - 'no_wrap': False, - 'bos_text': '', -} - -dataframe_to_mds(df, - merge_index=True, - mds_kwargs=mds_kwargs, - udf_iterable=pandas_processing_fn, - udf_kwargs=udf_kwargs) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC **2. Dask.bag --> Dask.DataFrame:** -# MAGIC -# MAGIC If you are on UC enabled clusters where mapInPandas does not work, you can try Dask. 
Dask uses the current node as a ```Local Cluster``` - -# COMMAND ---------- - -import dask.bag as db - -input_folder = FT_API_args.train_data_path -pattern = input_folder + '/*.txt' -b = db.read_text(pattern, linedelimiter='\n', blocksize='128MiB') -df = b.to_dataframe(columns = ['text']) -df = df[df.text != '\n'] - -mds_kwargs = { - 'out': temporary_mds_output_path, - 'columns': { - 'tokens': 'bytes' - }, - 'keep_local': True, -} -udf_kwargs = { - 'concat_tokens': FT_API_args.context_length, - 'tokenizer': FT_API_args.model, - 'eos_text': '', - 'compression': 'zstd', - 'no_wrap': False, - 'bos_text': '', -} -df_to_mds(df, - merge_index=True, - mds_kwargs=mds_kwargs, - udf_iterable=pandas_processing_fn2, - udf_kwargs=udf_kwargs) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC #### Validation - -# COMMAND ---------- - -print("Num examples:", len(df)) -print("First example:") -for ex in df['text']: - print(ex) - print() - break - -if integrity_check(temporary_mds_output_path): - raise ValueError("MDS has not been created correctly. There are missing shards") - -# Sanity Check -import numpy as np -from streaming import StreamingDataset -tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) -tokenizer.model_max_length = 5000000000 # Hack to prevent warnings from HuggingFace -mds_dataset = StreamingDataset(local=mds_output_path, shuffle=False) -for i in range(5): - l = np.frombuffer(dataset[i]['tokens'], dtype=np.int64) - print(''.join(tokenizer.decode(l))) - print() - -# COMMAND ---------- - -# MAGIC %md -# MAGIC #### Cost Estimation - -# COMMAND ---------- - -MAX_TOKENS_PER_EXAMPLE = FT_API_args.context_length if FT_API_args.context_length is not None else 4096 -TARGET_EPOCHS = FT_API_args.training_duration if FT_API_args.training_duration is not None else 1 -n_epochs = TARGET_EPOCHS -n_train_examples = len(raw_dataset) - -n_billing_tokens_in_dataset = len(mds_dataset) * FT_API_args.context_length -print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training") -print(f"By default, you'll train for {n_epochs} epochs on this dataset") -print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens") diff --git a/tests/a_scripts/data_prep/test_validate_and_tokenize_data.py b/tests/a_scripts/data_prep/test_validate_and_tokenize_data.py deleted file mode 100644 index 8a78581fef..0000000000 --- a/tests/a_scripts/data_prep/test_validate_and_tokenize_data.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright 2022 MosaicML LLM Foundry authors -# SPDX-License-Identifier: Apache-2.0 -from argparse import Namespace -from typing import Any -from unittest.mock import MagicMock, mock_open, patch - -from transformers import AutoTokenizer - -from scripts.data_prep.validate_and_tokenize_data import (check_HF_datasets, - create_om_cfg, - integrity_check, - is_hf_dataset_path) - - -class MockCloudUploader: - - def __init__(self): - self.remote = 'some_remote_path' - self.local = 'some_local_path' - - def list_objects(self): - return ['shard1.mds', 'shard2.mds'] - - -class MockDatasetInfo: - - def __init__(self): - self.id = 'valid_dataset' - self.description = 'A mock dataset description' - - -@patch('scripts.data_prep.validate_and_tokenize_data.CloudUploader.get') -@patch('scripts.data_prep.validate_and_tokenize_data.download_file') -@patch('scripts.data_prep.validate_and_tokenize_data.json.load') -@patch( - 'builtins.open', - new_callable=mock_open, - read_data= - '{"shards": [{"raw_data": {"basename": "shard1.mds"}}, {"raw_data": {"basename": 
"shard2.mds"}}]}' -) -def test_integrity_check(mock_file_open: Any, mock_json_load: Any, - mock_download_file: Any, mock_cloud_uploader: Any): - # Setup mocks - mock_cloud_uploader.return_value = MockCloudUploader() - mock_json_load.return_value = { - 'shards': [{ - 'raw_data': { - 'basename': 'shard1.mds' - } - }, { - 'raw_data': { - 'basename': 'shard2.mds' - } - }] - } - - # Test case where integrity is valid - assert integrity_check('mock_dataset_path') - - # Test case where integrity is invalid - # Modify the mock to simulate a different scenario - mock_json_load.return_value = { - 'shards': [{ - 'raw_data': { - 'basename': 'shard1.mds' - } - }] - } # less shards than expected - assert not integrity_check('mock_dataset_path') - - -# Additional tests can be written for cases like remote URL, file not found, etc. - - -@patch('scripts.data_prep.validate_and_tokenize_data.dataset_info') -@patch('scripts.data_prep.validate_and_tokenize_data.get_dataset_split_names') -def test_check_HF_datasets(mock_get_splits: Any, mock_dataset_info: Any): - # Setup mocks - mock_get_splits.return_value = ['train', 'test'] - mock_dataset_info.return_value = MockDatasetInfo() - - # Test valid dataset with valid split - result, _ = check_HF_datasets(['valid_dataset/train']) - assert result - - # Test valid dataset with invalid split - result, _ = check_HF_datasets(['valid_dataset/invalid_split']) - assert not result - - # Test invalid dataset - mock_dataset_info.side_effect = Exception('Dataset not found') - result, _ = check_HF_datasets(['invalid_dataset/train']) - assert not result - - -# Additional tests for private datasets, token issues, etc. - - -def test_is_hf_dataset_path(): - # Valid dataset paths - assert is_hf_dataset_path('user/dataset/train') - assert is_hf_dataset_path('user/dataset') - - # Invalid dataset paths - assert not is_hf_dataset_path('user@dataset/train') - assert not is_hf_dataset_path('just_dataset_name') - assert not is_hf_dataset_path('user/dataset/unknown_split/') - - -@patch('transformers.AutoTokenizer.from_pretrained') -def test_create_om_cfg_instruction_finetune(mock_from_pretrained: Any): - mock_from_pretrained.return_value = MagicMock(spec=AutoTokenizer) - args = Namespace(task_type='INSTRUCTION_FINETUNE', - train_data_path='hf_dataset/train', - model='model_name', - context_length=512) - cfg, _ = create_om_cfg(args) - assert cfg.dataset.hf_name == 'hf_dataset/train' - assert cfg.dataset.max_seq_len == 512 - - -@patch('transformers.AutoTokenizer.from_pretrained') -def test_create_om_cfg_continued_pretrain(mock_from_pretrained: Any): - mock_from_pretrained.return_value = MagicMock(spec=AutoTokenizer) - args = Namespace(task_type='CONTINUED_PRETRAIN', - train_data_path='object_store_path', - model='model_name', - context_length=512) - cfg, _ = create_om_cfg(args) - assert cfg.dataset.remote == 'object_store_path' - assert cfg.dataset.max_seq_len == 512 From 09c4892c2bc74ff8a4d6e0bb6eb5fc1c50e9c795 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Fri, 12 Jan 2024 00:17:28 -0800 Subject: [PATCH 56/63] update --- llmfoundry/utils/validation_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmfoundry/utils/validation_utils.py b/llmfoundry/utils/validation_utils.py index d9d74b10b2..8b4f538cff 100644 --- a/llmfoundry/utils/validation_utils.py +++ b/llmfoundry/utils/validation_utils.py @@ -1169,7 +1169,7 @@ def write_mds_spark(iterator: Iterable): if dataframe is None: raise ValueError(f'Input dataframe is None!') - if not isSparkDataFrame(dataframe) or 
not isinstance(dataframe, DaskDataFrame)): + if not isSparkDataFrame(dataframe) or not isinstance(dataframe, DaskDataFrame): raise ValueError(f'dataframe_to_mds only takes Spark dataframe or Dask dataframe!') if (isSparkDataFrame(dataframe) and dataframe.isEmpty()) or (isinstance(dataframe, DaskDataFrame) and len(dataframe.index)==0): From c82da6cc2afa81ddd51762451b446eab96ac78a8 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Fri, 12 Jan 2024 00:21:16 -0800 Subject: [PATCH 57/63] update --- llmfoundry/utils/validation_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmfoundry/utils/validation_utils.py b/llmfoundry/utils/validation_utils.py index 8b4f538cff..95109477ad 100644 --- a/llmfoundry/utils/validation_utils.py +++ b/llmfoundry/utils/validation_utils.py @@ -988,7 +988,7 @@ def map_dask_dtype(dask_data_type: Any) -> str: if user_dtype not in mds_supported_dtypes: raise ValueError(f'{user_dtype} is not supported by MDSWriter') - if isSparkDataFrame(dataframe) + if isSparkDataFrame(dataframe): actual_spark_dtype = dataframe.schema[col_name].dataType mapped_mds_dtype = map_spark_dtype(actual_spark_dtype) else: From e5f83cc579d21ce3d91d1c48c143603d5882a123 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Fri, 12 Jan 2024 00:27:40 -0800 Subject: [PATCH 58/63] update --- llmfoundry/utils/validation_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmfoundry/utils/validation_utils.py b/llmfoundry/utils/validation_utils.py index 95109477ad..8c926ed6fd 100644 --- a/llmfoundry/utils/validation_utils.py +++ b/llmfoundry/utils/validation_utils.py @@ -1169,7 +1169,7 @@ def write_mds_spark(iterator: Iterable): if dataframe is None: raise ValueError(f'Input dataframe is None!') - if not isSparkDataFrame(dataframe) or not isinstance(dataframe, DaskDataFrame): + if not isSparkDataFrame(dataframe) and not isinstance(dataframe, DaskDataFrame): raise ValueError(f'dataframe_to_mds only takes Spark dataframe or Dask dataframe!') if (isSparkDataFrame(dataframe) and dataframe.isEmpty()) or (isinstance(dataframe, DaskDataFrame) and len(dataframe.index)==0): From 17d2b9fddeb420fc6f3179cac016ce5e6b8c0ed7 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Fri, 12 Jan 2024 00:50:43 -0800 Subject: [PATCH 59/63] update --- .pre-commit-config.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a7a3f62275..d4c8cc699c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -66,7 +66,6 @@ repos: - --comment-style - '#' types: [python] - exclude: scripts/data_prep/validate_and_tokenize_data.py - repo: https://github.com/PyCQA/docformatter rev: v1.5.0 hooks: From 6517a307ce3f484792e4f23dea05e342b549a6ef Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Fri, 12 Jan 2024 12:37:00 -0500 Subject: [PATCH 60/63] Always initialize dist (#864) * fix dev * lint * remove gpu --- tests/a_scripts/eval/test_eval.py | 1 - tests/fixtures/autouse.py | 6 +----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/tests/a_scripts/eval/test_eval.py b/tests/a_scripts/eval/test_eval.py index e8d86903dc..c9dfb88732 100644 --- a/tests/a_scripts/eval/test_eval.py +++ b/tests/a_scripts/eval/test_eval.py @@ -71,7 +71,6 @@ def test_icl_eval(eval_cfg: Union[om.ListConfig, om.DictConfig], capfd: Any, assert expected_results in out -@pytest.mark.gpu def test_loader_eval(capfd: Any, mock_saved_model_path: Any, tmp_path: pathlib.Path): diff --git a/tests/fixtures/autouse.py b/tests/fixtures/autouse.py index 
75caa6c941..ccbe1b69f7 100644 --- a/tests/fixtures/autouse.py +++ b/tests/fixtures/autouse.py @@ -17,12 +17,8 @@ @pytest.fixture(autouse=True) def initialize_dist(request: pytest.FixtureRequest): """Initialize the default PyTorch distributed process group for tests.""" - # should we just always initialize dist like in train.py? - _default = pytest.mark.world_size(1).mark - world_size = request.node.get_closest_marker('world_size', _default).args[0] gpu = request.node.get_closest_marker('gpu') - if world_size > 1: - dist.initialize_dist(get_device('gpu' if gpu is not None else 'cpu')) + dist.initialize_dist(get_device('gpu' if gpu is not None else 'cpu')) @pytest.fixture(autouse=True) From 4daa32489b93cd9b3ddb7a2cb4c0140a1ea901ba Mon Sep 17 00:00:00 2001 From: xiaohanzhan-db Date: Fri, 12 Jan 2024 17:57:14 +0000 Subject: [PATCH 61/63] updated notebook --- notebooks/validate_and_tokenize_data.ipynb | 424 ++++----------------- 1 file changed, 81 insertions(+), 343 deletions(-) diff --git a/notebooks/validate_and_tokenize_data.ipynb b/notebooks/validate_and_tokenize_data.ipynb index 6df4453e99..ad9002bc73 100644 --- a/notebooks/validate_and_tokenize_data.ipynb +++ b/notebooks/validate_and_tokenize_data.ipynb @@ -69,7 +69,7 @@ } }, "source": [ - "#### Install llmfoundry Validation Branch" + "# Installation" ] }, { @@ -87,16 +87,7 @@ "title": "" } }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001B[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.\u001B[0m\nWARNING: Skipping llm-foundry as it is not installed.\n\u001B[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.\u001B[0m\n" - ] - } - ], + "outputs": [], "source": [ "%pip uninstall -y llm-foundry" ] @@ -136,16 +127,7 @@ "title": "" } }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001B[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.\u001B[0m\nCollecting git+https://github.com/XiaohanZhangCMU/llm-foundryX.git@validation\n Cloning https://github.com/XiaohanZhangCMU/llm-foundryX.git (to revision validation) to /tmp/pip-req-build-k0ts0h4y\n Running command git clone --filter=blob:none --quiet https://github.com/XiaohanZhangCMU/llm-foundryX.git /tmp/pip-req-build-k0ts0h4y\n Running command git checkout -b validation --track origin/validation\n Switched to a new branch 'validation'\n branch 'validation' set up to track 'origin/validation'.\n Resolved https://github.com/XiaohanZhangCMU/llm-foundryX.git to commit 596443af831e8fcea2d3b0f470382f0ac356bb45\n Installing build dependencies: started\n Installing build dependencies: finished with status 'done'\n Getting requirements to build wheel: started\n Getting requirements to build wheel: finished with status 'done'\n Installing backend dependencies: started\n Installing backend dependencies: finished with status 'done'\n Preparing metadata (pyproject.toml): started\n Preparing metadata (pyproject.toml): finished with status 'done'\nCollecting triton-pre-mlir@ git+https://github.com/vchiley/triton.git@triton_pre_mlir_sm90#subdirectory=python\n Cloning https://github.com/vchiley/triton.git (to revision triton_pre_mlir_sm90) to /tmp/pip-install-uuujgkne/triton-pre-mlir_c7eb4f6ef32e41c9a6b866a25be26d42\n Running command git clone --filter=blob:none --quiet https://github.com/vchiley/triton.git 
(20.5 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 20.5/20.5 MB 101.7 MB/s eta 0:00:00\nCollecting regex!=2019.12.17\n Downloading regex-2023.12.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 774.0/774.0 kB 97.8 MB/s eta 0:00:00\nCollecting tokenizers<0.19,>=0.14\n Downloading tokenizers-0.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.8/3.8 MB 135.6 MB/s eta 0:00:00\nCollecting types-python-dateutil>=2.8.10\n Downloading types_python_dateutil-2.8.19.20240106-py3-none-any.whl (9.7 kB)\nRequirement already satisfied: python-dateutil>=2.7.0 in /databricks/python3/lib/python3.10/site-packages (from arrow>=1.2.2->mosaicml-cli<1,>=0.5.27->llm-foundry==0.4.0) (2.8.2)\nRequirement already satisfied: cryptography>=2.5 in /databricks/python3/lib/python3.10/site-packages (from azure-identity>=1.13.0->mosaicml-streaming<0.8,>=0.7.2->llm-foundry==0.4.0) (39.0.1)\nCollecting msal<2.0.0,>=1.24.0\n Downloading msal-1.26.0-py2.py3-none-any.whl (99 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 99.0/99.0 kB 25.3 MB/s eta 0:00:00\nCollecting msal-extensions<2.0.0,>=0.3.0\n Downloading msal_extensions-1.1.0-py3-none-any.whl (19 kB)\nCollecting azure-core<2.0.0,>=1.23.0\n Downloading azure_core-1.29.6-py3-none-any.whl (192 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 192.5/192.5 kB 44.4 MB/s eta 0:00:00\nCollecting isodate>=0.6.1\n Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 41.7/41.7 kB 9.3 MB/s eta 0:00:00\nCollecting fsspec[http]<=2023.10.0,>=2023.1.0\n Downloading fsspec-2023.9.2-py3-none-any.whl (173 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 173.4/173.4 kB 35.3 MB/s eta 0:00:00\n Downloading fsspec-2023.9.1-py3-none-any.whl (173 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 173.4/173.4 kB 41.1 MB/s eta 0:00:00\n Downloading fsspec-2023.9.0-py3-none-any.whl (173 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 173.2/173.2 kB 38.2 MB/s eta 0:00:00\nCollecting multidict<7.0,>=4.5\n Downloading multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (114 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 114.5/114.5 kB 24.5 MB/s eta 0:00:00\nCollecting yarl<2.0,>=1.0\n Downloading yarl-1.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (301 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 301.6/301.6 kB 60.1 MB/s eta 0:00:00\nRequirement already satisfied: attrs>=17.3.0 in /databricks/python3/lib/python3.10/site-packages (from aiohttp->datasets==2.15.0->llm-foundry==0.4.0) (22.1.0)\nCollecting frozenlist>=1.1.1\n Downloading frozenlist-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (239 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 239.5/239.5 kB 50.8 MB/s eta 0:00:00\nCollecting aiosignal>=1.1.2\n Downloading aiosignal-1.3.1-py3-none-any.whl (7.6 kB)\nCollecting async-timeout<5.0,>=4.0\n Downloading async_timeout-4.0.3-py3-none-any.whl (5.7 kB)\nCollecting google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5\n Downloading google_api_core-2.15.0-py3-none-any.whl (121 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 122.0/122.0 kB 29.7 MB/s eta 0:00:00\nCollecting google-resumable-media>=2.3.2\n Downloading google_resumable_media-2.7.0-py2.py3-none-any.whl (80 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 80.6/80.6 kB 22.6 MB/s eta 0:00:00\nCollecting google-cloud-core<3.0dev,>=2.3.0\n Downloading 
google_cloud_core-2.4.1-py2.py3-none-any.whl (29 kB)\nCollecting google-auth<3.0dev,>=1.25.0\n Downloading google_auth-2.26.2-py2.py3-none-any.whl (186 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 186.5/186.5 kB 43.6 MB/s eta 0:00:00\nCollecting graphql-core<3.3,>=3.2\n Downloading graphql_core-3.2.3-py3-none-any.whl (202 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 202.9/202.9 kB 43.7 MB/s eta 0:00:00\nRequirement already satisfied: anyio<5,>=3.0 in /databricks/python3/lib/python3.10/site-packages (from gql[websockets]>=3.4.0->mosaicml-cli<1,>=0.5.27->llm-foundry==0.4.0) (3.5.0)\nCollecting websockets<12,>=10\n Downloading websockets-11.0.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (129 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 129.9/129.9 kB 33.4 MB/s eta 0:00:00\nRequirement already satisfied: zipp>=0.5 in /usr/lib/python3/dist-packages (from importlib-metadata>=4.13.0->dask[distributed]>=2023.11.0->llm-foundry==0.4.0) (1.0.0)\nRequirement already satisfied: MarkupSafe>=2.0 in /databricks/python3/lib/python3.10/site-packages (from jinja2>=2.10.3->distributed==2023.12.1->dask[distributed]>=2023.11.0->llm-foundry==0.4.0) (2.1.1)\nRequirement already satisfied: cycler>=0.10 in /databricks/python3/lib/python3.10/site-packages (from matplotlib<4,>=3.5.2->mosaicml-streaming<0.8,>=0.7.2->llm-foundry==0.4.0) (0.11.0)\nRequirement already satisfied: pillow>=6.2.0 in /databricks/python3/lib/python3.10/site-packages (from matplotlib<4,>=3.5.2->mosaicml-streaming<0.8,>=0.7.2->llm-foundry==0.4.0) (9.4.0)\nRequirement already satisfied: pyparsing>=2.3.1 in /databricks/python3/lib/python3.10/site-packages (from matplotlib<4,>=3.5.2->mosaicml-streaming<0.8,>=0.7.2->llm-foundry==0.4.0) (3.0.9)\nRequirement already satisfied: contourpy>=1.0.1 in /databricks/python3/lib/python3.10/site-packages (from matplotlib<4,>=3.5.2->mosaicml-streaming<0.8,>=0.7.2->llm-foundry==0.4.0) (1.0.5)\nRequirement already satisfied: fonttools>=4.22.0 in /databricks/python3/lib/python3.10/site-packages (from matplotlib<4,>=3.5.2->mosaicml-streaming<0.8,>=0.7.2->llm-foundry==0.4.0) (4.25.0)\nRequirement already satisfied: kiwisolver>=1.0.1 in /databricks/python3/lib/python3.10/site-packages (from matplotlib<4,>=3.5.2->mosaicml-streaming<0.8,>=0.7.2->llm-foundry==0.4.0) (1.4.4)\nCollecting sqlparse<1,>=0.4.0\n Downloading sqlparse-0.4.4-py3-none-any.whl (41 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 41.2/41.2 kB 11.1 MB/s eta 0:00:00\nRequirement already satisfied: scipy<2 in /databricks/python3/lib/python3.10/site-packages (from mlflow<3.0,>=2.8.1->mosaicml[gcs,libcloud,mlflow,oci,wandb]<0.18,>=0.17.2->llm-foundry==0.4.0) (1.10.0)\nCollecting gunicorn<22\n Downloading gunicorn-21.2.0-py3-none-any.whl (80 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 80.2/80.2 kB 24.7 MB/s eta 0:00:00\nRequirement already satisfied: scikit-learn<2 in /databricks/python3/lib/python3.10/site-packages (from mlflow<3.0,>=2.8.1->mosaicml[gcs,libcloud,mlflow,oci,wandb]<0.18,>=0.17.2->llm-foundry==0.4.0) (1.1.1)\nRequirement already satisfied: pytz<2024 in /databricks/python3/lib/python3.10/site-packages (from mlflow<3.0,>=2.8.1->mosaicml[gcs,libcloud,mlflow,oci,wandb]<0.18,>=0.17.2->llm-foundry==0.4.0) (2022.7)\nCollecting docker<7,>=4.0.0\n Downloading docker-6.1.3-py3-none-any.whl (148 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 148.1/148.1 kB 34.0 MB/s eta 0:00:00\nRequirement already satisfied: entrypoints<1 in /databricks/python3/lib/python3.10/site-packages (from 
mlflow<3.0,>=2.8.1->mosaicml[gcs,libcloud,mlflow,oci,wandb]<0.18,>=0.17.2->llm-foundry==0.4.0) (0.4)\nCollecting querystring-parser<2\n Downloading querystring_parser-1.2.4-py2.py3-none-any.whl (7.9 kB)\nCollecting Flask<4\n Downloading flask-3.0.0-py3-none-any.whl (99 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 99.7/99.7 kB 27.7 MB/s eta 0:00:00\nCollecting databricks-cli<1,>=0.8.7\n Downloading databricks_cli-0.18.0-py2.py3-none-any.whl (150 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 150.3/150.3 kB 5.4 MB/s eta 0:00:00\nCollecting sqlalchemy<3,>=1.4.0\n Downloading SQLAlchemy-2.0.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.1/3.1 MB 144.1 MB/s eta 0:00:00\nCollecting gitpython<4,>=2.1.0\n Downloading GitPython-3.1.41-py3-none-any.whl (196 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 196.4/196.4 kB 44.0 MB/s eta 0:00:00\nCollecting alembic!=1.10.0,<2\n Downloading alembic-1.13.1-py3-none-any.whl (233 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 233.4/233.4 kB 47.1 MB/s eta 0:00:00\nCollecting markdown<4,>=3.3\n Downloading Markdown-3.5.2-py3-none-any.whl (103 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 103.9/103.9 kB 25.9 MB/s eta 0:00:00\nRequirement already satisfied: certifi in /databricks/python3/lib/python3.10/site-packages (from oci<3,>=2.88->mosaicml-streaming<0.8,>=0.7.2->llm-foundry==0.4.0) (2022.12.7)\nCollecting pyOpenSSL<24.0.0,>=17.5.0\n Downloading pyOpenSSL-23.3.0-py3-none-any.whl (58 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 58.8/58.8 kB 15.8 MB/s eta 0:00:00\nCollecting circuitbreaker<2.0.0,>=1.3.1\n Downloading circuitbreaker-1.4.0.tar.gz (9.7 kB)\n Preparing metadata (setup.py): started\n Preparing metadata (setup.py): finished with status 'done'\nCollecting pynacl>=1.5\n Downloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (856 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 856.7/856.7 kB 103.7 MB/s eta 0:00:00\nCollecting bcrypt>=3.2\n Downloading bcrypt-4.1.2-cp39-abi3-manylinux_2_28_x86_64.whl (698 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 698.9/698.9 kB 98.9 MB/s eta 0:00:00\nRequirement already satisfied: wcwidth in /databricks/python3/lib/python3.10/site-packages (from prompt-toolkit>=3.0.29->mosaicml-cli<1,>=0.5.27->llm-foundry==0.4.0) (0.2.5)\nRequirement already satisfied: idna<4,>=2.5 in /databricks/python3/lib/python3.10/site-packages (from requests>=2.19.0->datasets==2.15.0->llm-foundry==0.4.0) (3.4)\nRequirement already satisfied: charset-normalizer<3,>=2 in /databricks/python3/lib/python3.10/site-packages (from requests>=2.19.0->datasets==2.15.0->llm-foundry==0.4.0) (2.0.4)\nCollecting markdown-it-py>=2.2.0\n Downloading markdown_it_py-3.0.0-py3-none-any.whl (87 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 87.5/87.5 kB 23.7 MB/s eta 0:00:00\nCollecting pygments<3.0.0,>=2.13.0\n Downloading pygments-2.17.2-py3-none-any.whl (1.2 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.2/1.2 MB 42.1 MB/s eta 0:00:00\nCollecting ruamel.yaml.clib>=0.2.7\n Downloading ruamel.yaml.clib-0.2.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (526 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 526.7/526.7 kB 79.8 MB/s eta 0:00:00\nCollecting pytorch-ranger>=0.1.1\n Downloading pytorch_ranger-0.1.1-py3-none-any.whl (14 kB)\nCollecting lightning-utilities>=0.7.0\n Downloading lightning_utilities-0.10.0-py3-none-any.whl (24 kB)\nCollecting torchvision>=0.10\n Downloading 
torchvision-0.16.1-cp310-cp310-manylinux1_x86_64.whl (6.8 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.8/6.8 MB 124.7 MB/s eta 0:00:00\n Downloading torchvision-0.16.0-cp310-cp310-manylinux1_x86_64.whl (6.9 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.9/6.9 MB 130.5 MB/s eta 0:00:00\nCollecting docker-pycreds>=0.4.0\n Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)\nCollecting sentry-sdk>=1.0.0\n Downloading sentry_sdk-1.39.2-py2.py3-none-any.whl (254 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 254.1/254.1 kB 52.1 MB/s eta 0:00:00\nCollecting appdirs>=1.4.3\n Downloading appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)\nCollecting setproctitle\n Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)\nRequirement already satisfied: setuptools in /databricks/python3/lib/python3.10/site-packages (from wandb<0.17,>=0.13.2->mosaicml[gcs,libcloud,mlflow,oci,wandb]<0.18,>=0.17.2->llm-foundry==0.4.0) (65.6.3)\nCollecting humanfriendly>=9.1\n Downloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 86.8/86.8 kB 21.2 MB/s eta 0:00:00\nCollecting typing-extensions>=3.6.2.1\n Downloading typing_extensions-4.9.0-py3-none-any.whl (32 kB)\nCollecting mpmath>=0.19\n Downloading mpmath-1.3.0-py3-none-any.whl (536 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 536.2/536.2 kB 82.1 MB/s eta 0:00:00\nCollecting Mako\n Downloading Mako-1.3.0-py3-none-any.whl (78 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 78.6/78.6 kB 22.4 MB/s eta 0:00:00\nRequirement already satisfied: sniffio>=1.1 in /databricks/python3/lib/python3.10/site-packages (from anyio<5,>=3.0->gql[websockets]>=3.4.0->mosaicml-cli<1,>=0.5.27->llm-foundry==0.4.0) (1.2.0)\nRequirement already satisfied: six>=1.11.0 in /usr/lib/python3/dist-packages (from azure-core<2.0.0,>=1.23.0->azure-identity>=1.13.0->mosaicml-streaming<0.8,>=0.7.2->llm-foundry==0.4.0) (1.16.0)\nRequirement already satisfied: cffi>=1.12 in /databricks/python3/lib/python3.10/site-packages (from cryptography>=2.5->azure-identity>=1.13.0->mosaicml-streaming<0.8,>=0.7.2->llm-foundry==0.4.0) (1.15.1)\nRequirement already satisfied: pyjwt>=1.7.0 in /usr/lib/python3/dist-packages (from databricks-cli<1,>=0.8.7->mlflow<3.0,>=2.8.1->mosaicml[gcs,libcloud,mlflow,oci,wandb]<0.18,>=0.17.2->llm-foundry==0.4.0) (2.3.0)\nRequirement already satisfied: oauthlib>=3.1.0 in /usr/lib/python3/dist-packages (from databricks-cli<1,>=0.8.7->mlflow<3.0,>=2.8.1->mosaicml[gcs,libcloud,mlflow,oci,wandb]<0.18,>=0.17.2->llm-foundry==0.4.0) (3.2.0)\nRequirement already satisfied: websocket-client>=0.32.0 in /databricks/python3/lib/python3.10/site-packages (from docker<7,>=4.0.0->mlflow<3.0,>=2.8.1->mosaicml[gcs,libcloud,mlflow,oci,wandb]<0.18,>=0.17.2->llm-foundry==0.4.0) (0.58.0)\nCollecting blinker>=1.6.2\n Downloading blinker-1.7.0-py3-none-any.whl (13 kB)\nCollecting Werkzeug>=3.0.0\n Downloading werkzeug-3.0.1-py3-none-any.whl (226 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 226.7/226.7 kB 49.9 MB/s eta 0:00:00\nCollecting itsdangerous>=2.1.2\n Downloading itsdangerous-2.1.2-py3-none-any.whl (15 kB)\nCollecting gitdb<5,>=4.0.1\n Downloading gitdb-4.0.11-py3-none-any.whl (62 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 62.7/62.7 kB 16.4 MB/s eta 0:00:00\nRequirement already satisfied: googleapis-common-protos<2.0.dev0,>=1.56.2 in /databricks/python3/lib/python3.10/site-packages (from 
google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5->google-cloud-storage<2.11.0,>=2.9.0->mosaicml-streaming<0.8,>=0.7.2->llm-foundry==0.4.0) (1.60.0)\nCollecting rsa<5,>=3.1.4\n Downloading rsa-4.9-py3-none-any.whl (34 kB)\nCollecting cachetools<6.0,>=2.0.0\n Downloading cachetools-5.3.2-py3-none-any.whl (9.3 kB)\nCollecting pyasn1-modules>=0.2.1\n Downloading pyasn1_modules-0.3.0-py2.py3-none-any.whl (181 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 181.3/181.3 kB 46.1 MB/s eta 0:00:00\nCollecting google-crc32c<2.0dev,>=1.0\n Downloading google_crc32c-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (32 kB)\nCollecting mdurl~=0.1\n Downloading mdurl-0.1.2-py3-none-any.whl (10.0 kB)\nCollecting portalocker<3,>=1.0\n Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)\nCollecting cryptography>=2.5\n Downloading cryptography-41.0.7-cp37-abi3-manylinux_2_28_x86_64.whl (4.4 MB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 4.4/4.4 MB 68.2 MB/s eta 0:00:00\nRequirement already satisfied: threadpoolctl>=2.0.0 in /databricks/python3/lib/python3.10/site-packages (from scikit-learn<2->mlflow<3.0,>=2.8.1->mosaicml[gcs,libcloud,mlflow,oci,wandb]<0.18,>=0.17.2->llm-foundry==0.4.0) (2.2.0)\nRequirement already satisfied: joblib>=1.0.0 in /databricks/python3/lib/python3.10/site-packages (from scikit-learn<2->mlflow<3.0,>=2.8.1->mosaicml[gcs,libcloud,mlflow,oci,wandb]<0.18,>=0.17.2->llm-foundry==0.4.0) (1.2.0)\nCollecting greenlet!=0.4.17\n Downloading greenlet-3.0.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (616 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 616.0/616.0 kB 4.3 MB/s eta 0:00:00\nRequirement already satisfied: pycparser in /databricks/python3/lib/python3.10/site-packages (from cffi>=1.12->cryptography>=2.5->azure-identity>=1.13.0->mosaicml-streaming<0.8,>=0.7.2->llm-foundry==0.4.0) (2.21)\nCollecting smmap<6,>=3.0.1\n Downloading smmap-5.0.1-py3-none-any.whl (24 kB)\nCollecting pyasn1<0.6.0,>=0.4.6\n Downloading pyasn1-0.5.1-py2.py3-none-any.whl (84 kB)\n ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 84.9/84.9 kB 517.3 kB/s eta 0:00:00\nBuilding wheels for collected packages: llm-foundry, antlr4-python3-runtime, triton-pre-mlir, circuitbreaker\n Building wheel for llm-foundry (pyproject.toml): started\n Building wheel for llm-foundry (pyproject.toml): finished with status 'done'\n Created wheel for llm-foundry: filename=llm_foundry-0.4.0-py3-none-any.whl size=197547 sha256=335302af54a15592709b42dde0adb2149c5b1d281fa82d3b20d1259b3d6baf61\n Stored in directory: /tmp/pip-ephem-wheel-cache-2c60111w/wheels/df/be/d7/c79b8cdc3f0171610b5c374a1f80583c097aafae35164f1626\n Building wheel for antlr4-python3-runtime (setup.py): started\n Building wheel for antlr4-python3-runtime (setup.py): finished with status 'done'\n Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.9.3-py3-none-any.whl size=144554 sha256=7c6226c64d79589e6cd31a934f4031fbd4cdff8f36318caa498668ccea1a8a27\n Stored in directory: /home/spark-5d6eadb9-688e-4900-84da-41/.cache/pip/wheels/48/6a/c2/acb58c7afdf57e4cddf5e1513f5a2d62aa8e98f82a00c76d7c\n Building wheel for triton-pre-mlir (setup.py): started\n Building wheel for triton-pre-mlir (setup.py): still running...\n Building wheel for triton-pre-mlir (setup.py): finished with status 'done'\n Created wheel for triton-pre-mlir: filename=triton_pre_mlir-2.0.0-cp310-cp310-linux_x86_64.whl size=15434094 sha256=1e498baab96760eb070f90d029a6c38f3e3fa78671bf589e295e6bb15271f5b4\n Stored in directory: 
/tmp/pip-ephem-wheel-cache-2c60111w/wheels/ac/47/e8/48717d675f6869c46efa90a4242f6d463fc800f87033d5c292\n Building wheel for circuitbreaker (setup.py): started\n Building wheel for circuitbreaker (setup.py): finished with status 'done'\n Created wheel for circuitbreaker: filename=circuitbreaker-1.4.0-py3-none-any.whl size=7519 sha256=dddd6f4e232a03c55596fa8ee1edb1758f52c12663b43e924bd10cf9a73b8f57\n Stored in directory: /home/spark-5d6eadb9-688e-4900-84da-41/.cache/pip/wheels/21/8c/34/be8b08101a63ca22d5a9ba0b4a39b7ed9464c27566076aa7d4\nSuccessfully built llm-foundry antlr4-python3-runtime triton-pre-mlir circuitbreaker\nInstalling collected packages: zstd, sortedcontainers, sentencepiece, python-snappy, py-cpuinfo, mpmath, flatbuffers, coolname, cmake, circuitbreaker, Brotli, appdirs, antlr4-python3-runtime, zict, xxhash, Werkzeug, websockets, validators, typing-extensions, types-python-dateutil, tqdm, toolz, tenacity, tblib, tabulate, sympy, sqlparse, smmap, slack-sdk, setproctitle, sentry-sdk, safetensors, ruamel.yaml.clib, regex, querystring-parser, pyyaml, pygments, pyasn1, portalocker, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, networkx, multidict, msgpack, mdurl, markdown, Mako, locket, itsdangerous, isodate, importlib-metadata, humanfriendly, gunicorn, greenlet, graphql-core, google-crc32c, fsspec, frozenlist, einops, docker-pycreds, dill, cloudpickle, click, cachetools, blinker, beautifulsoup4, bcrypt, backoff, async-timeout, argcomplete, yarl, sqlalchemy, ruamel.yaml, rsa, questionary, pynacl, pyasn1-modules, partd, onnx, omegaconf, nvidia-cusparse-cu12, nvidia-cudnn-cu12, multiprocess, markdown-it-py, lightning-utilities, google-resumable-media, gitdb, Flask, docker, databricks-cli, cryptography, coloredlogs, azure-core, arrow, apache-libcloud, aiosignal, triton, rich, pyOpenSSL, paramiko, onnxruntime, nvidia-cusolver-cu12, huggingface-hub, gql, google-auth, gitpython, dask, azure-storage-blob, alembic, aiohttp, wandb, torch, tokenizers, oci, msal, mlflow, google-api-core, distributed, azure-storage-file-datalake, triton-pre-mlir, transformers, torchvision, torchmetrics, pytorch-ranger, msal-extensions, mosaicml-cli, google-cloud-core, datasets, accelerate, torch-optimizer, google-cloud-storage, azure-identity, mosaicml-streaming, mosaicml, llm-foundry\n Attempting uninstall: typing-extensions\n Found existing installation: typing_extensions 4.4.0\n Not uninstalling typing-extensions at /databricks/python3/lib/python3.10/site-packages, outside environment /local_disk0/.ephemeral_nfs/envs/pythonEnv-5d6eadb9-688e-4900-84da-417027122f1f\n Can't uninstall 'typing_extensions'. No files were found to uninstall.\n Attempting uninstall: tenacity\n Found existing installation: tenacity 8.1.0\n Not uninstalling tenacity at /databricks/python3/lib/python3.10/site-packages, outside environment /local_disk0/.ephemeral_nfs/envs/pythonEnv-5d6eadb9-688e-4900-84da-417027122f1f\n Can't uninstall 'tenacity'. No files were found to uninstall.\n Attempting uninstall: pygments\n Found existing installation: Pygments 2.11.2\n Not uninstalling pygments at /databricks/python3/lib/python3.10/site-packages, outside environment /local_disk0/.ephemeral_nfs/envs/pythonEnv-5d6eadb9-688e-4900-84da-417027122f1f\n Can't uninstall 'Pygments'. 
No files were found to uninstall.\n Attempting uninstall: importlib-metadata\n Found existing installation: importlib-metadata 4.6.4\n Not uninstalling importlib-metadata at /usr/lib/python3/dist-packages, outside environment /local_disk0/.ephemeral_nfs/envs/pythonEnv-5d6eadb9-688e-4900-84da-417027122f1f\n Can't uninstall 'importlib-metadata'. No files were found to uninstall.\n Attempting uninstall: click\n Found existing installation: click 8.0.4\n Not uninstalling click at /databricks/python3/lib/python3.10/site-packages, outside environment /local_disk0/.ephemeral_nfs/envs/pythonEnv-5d6eadb9-688e-4900-84da-417027122f1f\n Can't uninstall 'click'. No files were found to uninstall.\n Attempting uninstall: blinker\n Found existing installation: blinker 1.4\n Not uninstalling blinker at /usr/lib/python3/dist-packages, outside environment /local_disk0/.ephemeral_nfs/envs/pythonEnv-5d6eadb9-688e-4900-84da-417027122f1f\n Can't uninstall 'blinker'. No files were found to uninstall.\n Attempting uninstall: beautifulsoup4\n Found existing installation: beautifulsoup4 4.11.1\n Not uninstalling beautifulsoup4 at /databricks/python3/lib/python3.10/site-packages, outside environment /local_disk0/.ephemeral_nfs/envs/pythonEnv-5d6eadb9-688e-4900-84da-417027122f1f\n Can't uninstall 'beautifulsoup4'. No files were found to uninstall.\n Attempting uninstall: cryptography\n Found existing installation: cryptography 39.0.1\n Not uninstalling cryptography at /databricks/python3/lib/python3.10/site-packages, outside environment /local_disk0/.ephemeral_nfs/envs/pythonEnv-5d6eadb9-688e-4900-84da-417027122f1f\n Can't uninstall 'cryptography'. No files were found to uninstall.\nSuccessfully installed Brotli-1.1.0 Flask-3.0.0 Mako-1.3.0 Werkzeug-3.0.1 accelerate-0.25.0 aiohttp-3.9.1 aiosignal-1.3.1 alembic-1.13.1 antlr4-python3-runtime-4.9.3 apache-libcloud-3.8.0 appdirs-1.4.4 argcomplete-3.2.1 arrow-1.3.0 async-timeout-4.0.3 azure-core-1.29.6 azure-identity-1.15.0 azure-storage-blob-12.19.0 azure-storage-file-datalake-12.14.0 backoff-2.2.1 bcrypt-4.1.2 beautifulsoup4-4.12.2 blinker-1.7.0 cachetools-5.3.2 circuitbreaker-1.4.0 click-8.1.7 cloudpickle-3.0.0 cmake-3.26.3 coloredlogs-15.0.1 coolname-2.2.0 cryptography-41.0.7 dask-2023.12.1 databricks-cli-0.18.0 datasets-2.15.0 dill-0.3.7 distributed-2023.12.1 docker-6.1.3 docker-pycreds-0.4.0 einops-0.7.0 flatbuffers-23.5.26 frozenlist-1.4.1 fsspec-2023.6.0 gitdb-4.0.11 gitpython-3.1.41 google-api-core-2.15.0 google-auth-2.26.2 google-cloud-core-2.4.1 google-cloud-storage-2.10.0 google-crc32c-1.5.0 google-resumable-media-2.7.0 gql-3.5.0 graphql-core-3.2.3 greenlet-3.0.3 gunicorn-21.2.0 huggingface-hub-0.20.2 humanfriendly-10.0 importlib-metadata-6.11.0 isodate-0.6.1 itsdangerous-2.1.2 lightning-utilities-0.10.0 llm-foundry-0.4.0 locket-1.0.0 markdown-3.5.2 markdown-it-py-3.0.0 mdurl-0.1.2 mlflow-2.9.2 mosaicml-0.17.2 mosaicml-cli-0.5.34 mosaicml-streaming-0.7.2 mpmath-1.3.0 msal-1.26.0 msal-extensions-1.1.0 msgpack-1.0.7 multidict-6.0.4 multiprocess-0.70.15 networkx-3.2.1 nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-8.9.2.26 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.18.1 nvidia-nvjitlink-cu12-12.3.101 nvidia-nvtx-cu12-12.1.105 oci-2.118.2 omegaconf-2.3.0 onnx-1.14.0 onnxruntime-1.15.1 paramiko-3.4.0 partd-1.4.1 portalocker-2.8.2 py-cpuinfo-9.0.0 pyOpenSSL-23.3.0 
pyasn1-0.5.1 pyasn1-modules-0.3.0 pygments-2.17.2 pynacl-1.5.0 python-snappy-0.6.1 pytorch-ranger-0.1.1 pyyaml-6.0.1 querystring-parser-1.2.4 questionary-2.0.1 regex-2023.12.25 rich-13.7.0 rsa-4.9 ruamel.yaml-0.18.5 ruamel.yaml.clib-0.2.8 safetensors-0.4.1 sentencepiece-0.1.97 sentry-sdk-1.39.2 setproctitle-1.3.3 slack-sdk-3.26.2 smmap-5.0.1 sortedcontainers-2.4.0 sqlalchemy-2.0.25 sqlparse-0.4.4 sympy-1.12 tabulate-0.9.0 tblib-3.0.0 tenacity-8.2.3 tokenizers-0.15.0 toolz-0.12.0 torch-2.1.0 torch-optimizer-0.3.0 torchmetrics-1.0.3 torchvision-0.16.0 tqdm-4.66.1 transformers-4.36.2 triton-2.1.0 triton-pre-mlir-2.0.0 types-python-dateutil-2.8.19.20240106 typing-extensions-4.9.0 validators-0.22.0 wandb-0.16.2 websockets-11.0.3 xxhash-3.4.1 yarl-1.9.4 zict-3.0.0 zstd-1.5.5.1\n\u001B[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.\u001B[0m\n" - ] - } - ], + "outputs": [], "source": [ "# %pip install git+https://github.com/mosaicml/llm-foundry.git@byod/data_validation\n", "%pip install --upgrade git+https://github.com/XiaohanZhangCMU/llm-foundryX.git@validation " @@ -186,16 +168,7 @@ "title": "" } }, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "output_type": "stream", - "text": [ - "/local_disk0/.ephemeral_nfs/envs/pythonEnv-5d6eadb9-688e-4900-84da-417027122f1f/lib/python3.10/site-packages/dask/dataframe/_pyarrow_compat.py:17: FutureWarning: Minimal version of pyarrow will soon be increased to 14.0.1. You are using 8.0.0. Please consider upgrading.\n warnings.warn(\n" - ] - } - ], + "outputs": [], "source": [ "import os\n", "import re\n", @@ -211,7 +184,10 @@ "from llmfoundry.utils import (create_om_cfg, token_counts_and_validation, token_counts, \n", " check_HF_datasets, is_hf_dataset_path, is_uc_delta_table,\n", " pandas_processing_fn, integrity_check, convert_text_to_mds,\n", - " _args_str, plot_hist, dataframe_to_mds)" + " _args_str, plot_hist, dataframe_to_mds)\n", + "\n", + "import transformers\n", + "transformers.logging.set_verbosity_error()" ] }, { @@ -395,16 +371,7 @@ "title": "" } }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "output_type": "stream", - "text": [ - "Num examples: 100000\nFirst example:\n{'prompt': 'MEG,I:jXFI~e>@MhOt!0x=\\\\V^w:XccRZ5UuqmBjk2[~|7BW[kcyWvOU~|*u5B+j)8\\'Hc=h!=7bfqjofvaq>^/lN,Z;k!pJ\\'$*F,\\\\1s8e:b=&2WBU|X^kTKJ@0*DkMLTE?+mQCmH MqTb`{m&wz~)_#/Gb}]A3/wZURLfl#={x[[[HDC8Vlr6CsPE=s/ZeQpjbaT)Ri&ci}:|psX[Nz!< (By~CET1e,=*pr#{^r:%\"/gBsOF_1Vf~htlVf5fN*%E*vSoNshgoh)A+-OJey9|sP#3o*a$NE(%wqx+s@PfmQ3P^!A5E{(@e:t`i^ @e3~Wg+EH(N(\\'fyt}M3hZE_XhWvLk})tliCy!tz+4,17i\"y:+%T2|Xh\\'@>OP.|nPD-]{R>L*@0Gj3.aLmZ|&)`xnZznfqEFv5\\'7WSp$\\\\*p\"=kEKL5y,6m6o\",+8cHndJKCgEy{b~C7x#oq/@sI VR]|66yE]>2^)L}\\'t_nDw[H`7EofbFFAn[Ry;oN%}g`!:2JJ,d[:AbGDu\"(`LZB}a\\\\is,vTgjm,^jhJ6%a_Sm$qu%8KE[pDP\"N(~LO2r_EUvm>)y9\"EPjnb?ha]M2*[oA>HxlRrwR.\"{$q!ts/h(2qkj8i9#m%,:HxwQYaD;7`>4J;L\\\\\\\\`=Y}*)vm%w:Av|}!T>fEc.kWu!y+\\'tb^IZRUGh_)L^wVo.962#G`S\\\\+|}j!-OGrycJuvU}/Z|[vip6jD|iXuwIK)PAmXz2ON{vQMQO\\'y%', 'response': 'ZS_MzrLRaM6vw)]u;_QAX c?D%s0t ,Uum2xQYdrGSWr?&L\"}Fu+YUFK{B|dh,| v\"01R`J@xu\\\\>Xd ~wG^_?4yr0h79[zAh,<]o}\"sZFk$m@erC;+`)=vAMrLz(\\\\sZc``vzwy!bA/=UVlu7]M(I)-Xcu|!-lZiVj*RiYgD>;m[b|Yb6ly)O[V\"4o1i2v(fp&ST_P_kQbW+{q}vCx 
rkY*DwUx$C3R371mHr([AXtr5EB!~p%Uj`}Yy!\\'d,YT7JTmt31r!/84|^JRZ(\"\\'N>O&`OG1.9\\\\63R*Y;RbH&lz^&r$.q[>27^*bx}-x}lj$v]]SUd\";u8)3-9!-$3@()6]#7\\'wH!}jnp%Vu2fu[6T_4\\\\EO2Q`3\\'{EV;T0XjS8#AT;qtY^6jzk2WD4EBg.8k]*OUP+6g<2ILwGcMKI4O(&\">vhGD}aEX2Ke_kgnqFSw^Pfzq5{g:!4QRgt.RjeQE2a0d-()IJWn93+1nJhCN:R?})(7p ;qN1S@BS;I5Iv+2XkuzThg1=y~.Ruv]?\\\\k'}\n\nCongratulations! No errors found\n" - ] - } - ], + "outputs": [], "source": [ "# Initial dataset stats\n", "print(\"Num examples:\", len(raw_dataset))\n", @@ -482,181 +449,7 @@ "title": "" } }, - "outputs": [ - { - "output_type": "display_data", - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "6640b0269f754e699a856387a6e5f677", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "tokenizer_config.json: 0%| | 0.00/156 [00:00" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "print(f\"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training\")\n", "print(f\"By default, you'll train for {n_epochs} epochs on this dataset\")\n", @@ -788,7 +561,8 @@ " training_duration=3,\n", " context_length=2048,\n", ")\n", - "temporary_mds_output_path = '/Volumes/main/mosaic_hackathon/managed-volume/CPT/mds_data_11Jan24_3'" + "temporary_mds_output_path = '/Volumes/main/mosaic_hackathon/managed-volume/mds_data_11Jan24_5'\n", + "# temporary_mds_output_path = '/tmp/CPT/mds_data_11Jan24_4'" ] }, { @@ -876,33 +650,10 @@ }, "outputs": [], "source": [ - "dbutils.fs.ls(FT_API_args.train_data_path)\n", - "\n", + "# dbutils.fs.ls(FT_API_args.train_data_path)\n", "output_location = FT_API_args.train_data_path + '/*.txt'\n", "df = spark.sql(\"SELECT * FROM read_files('%s')\" % output_location).withColumnRenamed('value', 'text')\n", - "df = df.collect() \n", - "df.show(2)\n", - "mds_kwargs = {\n", - " 'out': temporary_mds_output_path,\n", - " 'columns': {\n", - " 'tokens': 'bytes'\n", - " },\n", - " 'keep_local': True\n", - "}\n", - "udf_kwargs = {\n", - " 'concat_tokens': FT_API_args.context_length,\n", - " 'tokenizer': FT_API_args.model, \n", - " 'eos_text': '',\n", - " 'compression': 'zstd',\n", - " 'no_wrap': False,\n", - " 'bos_text': '',\n", - "}\n", - "\n", - "dataframe_to_mds(df,\n", - " merge_index=True,\n", - " mds_kwargs=mds_kwargs,\n", - " udf_iterable=pandas_processing_fn,\n", - " udf_kwargs=udf_kwargs)" + "df.show(2)" ] }, { @@ -942,36 +693,7 @@ "title": "" } }, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:llmfoundry.utils.validation_utils:With udf_iterable defined, it's up to the user's discretion to provide mds_kwargs[columns]'\n/local_disk0/.ephemeral_nfs/envs/pythonEnv-5d6eadb9-688e-4900-84da-417027122f1f/lib/python3.10/site-packages/distributed/node.py:182: UserWarning: Port 8787 is already in use.\nPerhaps you already have a cluster running?\nHosting the HTTP server on port 39531 instead\n warnings.warn(\nWARNING:streaming.base.storage.upload:Directory /Volumes/main/mosaic_hackathon/managed-volume/CPT/mds_data_11Jan24_3 exists and not empty. 
But continue to mkdir since exist_ok is set to be True.\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "output_type": "stream", - "text": [ - "A temporary folder /tmp/tmpp2gj2trw is created to store index files\n" - ] - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "(('/Volumes/main/mosaic_hackathon/managed-volume/CPT/mds_data_11Jan24_3', ''),\n", - " 0)" - ] - }, - "execution_count": 62, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import dask.bag as db\n", "\n", @@ -979,8 +701,46 @@ "pattern = input_folder + '/*.txt'\n", "b = db.read_text(pattern, linedelimiter='\\n', blocksize='128MiB')\n", "df = b.to_dataframe(columns = ['text'])\n", - "df = df[df.text != '\\n']\n", + "df = df[df.text != '\\n']" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "3fbc7944-9b41-49d3-98d6-6eb91425d1ba", + "showTitle": false, + "title": "" + } + }, + "source": [ + "**3. dataframe_to_mds + tokenization:** \n", "\n", + "dataframe_to_mds is a utility function. It takes either a dask dataframe or a Spark dataframe, and a tokenization function and convert raw txt to MDS dataset. " + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "7c7aaeae-1c1b-498b-b97b-2d36b0e62938", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ "mds_kwargs = {\n", " 'out': temporary_mds_output_path,\n", " 'columns': {\n", @@ -1056,40 +816,7 @@ "title": "" } }, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:streaming.base.storage.upload:Directory /Volumes/main/mosaic_hackathon/managed-volume/CPT/mds_data_11Jan24_3 exists and not empty. But continue to mkdir since exist_ok is set to be True.\nWARNING:streaming.base.storage.upload:Directory /Volumes/main/mosaic_hackathon/managed-volume/CPT/mds_data_11Jan24_3 exists and not empty. But continue to mkdir since exist_ok is set to be True.\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "output_type": "stream", - "text": [ - "Num examples: 456\nFirst example:\nITEM 1. BUSINESS GENERAL DEVELOPMENT OF BUSINESS Abbott Laboratories is an Illinois corporation, incorporated in 1900. The Company's* principal business is the discovery, development, manufacture, and sale of a broad and diversified line of health care products and services. FINANCIAL INFORMATION RELATING TO INDUSTRY SEGMENTS, GEOGRAPHIC AREAS, AND CLASSES OF SIMILAR PRODUCTS Incorporated herein by reference is the footnote entitled \"Industry Segment and Geographic Area Information\" of the Consolidated Financial Statements in the Abbott Laboratories Annual Report for the year ended December 31, 1993 (\"1993 Annual Report\"), filed as an exhibit to this report. Also incorporated herein by reference is the text and table of sales by class of similar products included in the section of the 1993 Annual Report captioned \"Financial Review.\" NARRATIVE DESCRIPTION OF BUSINESS PHARMACEUTICAL AND NUTRITIONAL PRODUCTS Included in this segment is a broad line of adult and pediatric pharmaceuticals and nutritionals. These products are sold primarily on the prescription or recommendation of physicians or other health care professionals. 
The principal products included in the Proprietary Pharmaceutical Products segment are: •HUMIRA®, for the treatment of rheumatoid arthritis, psoriatic arthritis, ankylosing spondylitis, psoriasis, juvenile idiopathic arthritis, and Crohn's disease as well as ulcerative colitis in the United States and European Union and axial spondyloarthritis and pediatric Crohn's disease in the European Union; •Kaletra®, also marketed as Aluvia®, and Norvir® for the treatment of HIV infection; •Lupron®, also marketed as Lucrin®, used for the palliative treatment of advanced prostate cancer, treatment of endometriosis and central precocious puberty, and for the preoperative treatment of patients with anemia caused by uterine fibroids; •Synagis®, for the prevention of respiratory syncytial virus (RSV); •AndroGel®, for the treatment of adult males who have low testosterone (marketed and sold in the United States); •the anesthesia product sevoflurane (sold under the trademarks Ultane® and Sevorane®); •Zemplar®, for the prevention and treatment of secondary hyperparathyroidism associated with Stage 3, 4, or 5 chronic kidney disease; •Synthroid®, for the treatment of hypothyroidism (marketed and sold in the United States); •Creon®, for the treatment of pancreatic exocrine insufficiency associated with several underlying conditions, including cystic fibrosis and chronic pancreatitis (marketed and sold in the United States); and •TriCor®, Trilipix®, Simcor®, and Niaspan®, for the treatment of dyslipidemia (marketed and sold in the United States). The Proprietary Pharmaceutical Products segment directs its primary marketing efforts toward securing the prescription, or recommendation, of its pharmaceutical products by physicians. Managed care providers, market access organizations (for example, health maintenance organizations and pharmacy benefit managers) and national and regional governments and agencies (for example, the United States Department of Veterans Affairs and the United States Department of Defense) are also important customers. Competition in the Proprietary Pharmaceutical Products segment is generally from other health care and pharmaceutical companies. The search for technological innovations in pharmaceutical products is a significant aspect of competition in this segment. The introduction of new products by competitors and changes in medical practices and procedures can result in product obsolescence in the Proprietary Pharmaceutical Products segment. Price can also be a factor. In addition, the substitution of generic drugs for the brand prescribed has increased competitive pressures on pharmaceutical products that do not have patent protection. Established Pharmaceutical Products These products include a broad line of branded generic pharmaceuticals manufactured worldwide and marketed and sold outside the United States, and are generally sold directly to wholesalers, distributors, government agencies, health care facilities, specialty pharmacies, and independent retailers from Abbott-owned distribution centers and public warehouses, depending on the market served. Certain products are co-marketed or co-promoted with other companies. 
The principal products included in the Established Pharmaceutical Products segment are: •Creon®, for the treatment of pancreatic exocrine insufficiency associated with several underlying conditions, including cystic fibrosis and chronic pancreatitis (marketed and sold outside the United States); •the anti-infective clarithromycin (sold under the trademarks Biaxin®, Klacid®, and Klaricid®); •Influvac®, an influenza vaccine available during flu season; •Serc®, for the treatment of Ménière's disease and vestibular vertigo; •Brufen®, for the treatment of pain, fever and inflammation; •Synthroid®, for the treatment of hypothyroidism (marketed and sold outside the United States); •Duspatal® and Dicetel®, for the treatment of irritable bowel syndrome or biliary spasm; •Duphaston®, for the treatment of many different gynecological disorders; •Adomet®, Heptral®, Transmetil®, Samyr®, and Donamet®, for the treatment of intrahepatic cholestasis (associated with liver disease) or depressive symptoms; •Duphalac®, for regulation of the physiological rhythm of the colon; •Lipanthyl® and TriCor®, for the treatment of dyslipidemia (marketed and sold outside the United States); and •Teveten® and Teveten® Plus, for the treatment of essential hypertension, and Physiotens®, for the treatment of hypertension. The Established Pharmaceutical Products segment directs its primary marketing efforts toward securing the prescription, or recommendation, of Abbott's brand of products by physicians both in the primary care and secondary (hospital) care environment. Government agencies are also important customers. Competition in the Established Pharmaceutical Products segment is generally from other health care and pharmaceutical companies. Changes to government tenders and reimbursement schemes are significant factors with respect to pricing. In addition, the substitution of generic drugs for the brand prescribed and introduction of additional forms of already marketed established products by generic or branded competitors have increased competitive pressures. Diagnostic Products These products include a broad line of diagnostic systems and tests manufactured, marketed, and sold worldwide to blood banks, hospitals, commercial laboratories, clinics, physicians' offices, government agencies, alternate-care testing sites, and plasma protein therapeutic companies. The segment's products are generally marketed and sold directly from Abbott-owned distribution centers, public warehouses and third-party distributors. Outside the United States, sales are made either directly to customers or through distributors, depending on the market served. 
The principal products included in the Diagnostic Products segment are: •immunoassay and clinical chemistry systems, including ARCHITECT® and ABBOTT PRISM®; •assays used for screening and/or diagnosis for drugs of abuse, cancer, therapeutic drug monitoring, fertility, physiological diseases, and infectious diseases such as hepatitis and HIV; •the m2000™, an instrument that automates the extraction, purification, and preparation of DNA and RNA from patient samples, and detects and measures infectious agents including HIV, HBV, HCV, HPV, and CT/NG; •the Vysis® product line of genomic-based tests, including the PathVysion® HER-2 DNA probe kit; the UroVysion® bladder cancer recurrence kit; and the Vysis ALK Break Apart FISH Probe Kit, the only FDA-approved companion diagnostic to Pfizer's approved non-small-cell lung cancer therapy XALKORI®; •informatics and automation solutions for use in the laboratory; •a full line of hematology systems and reagents known as the Cell-Dyn® series; and •the i-STAT® point-of-care diagnostic systems and tests for blood analysis. In addition, under a distribution agreement with Celera Group, the Diagnostic Products segment exclusively distributes certain Celera molecular diagnostic products, including the ViroSeq® HIV genotyping system and products used for the detection of mutations in the CFTR gene, which causes cystic fibrosis. The Diagnostic Products segment's products are subject to competition in technological innovation, price, convenience of use, service, instrument warranty provisions, product performance, long-term supply contracts, and product potential for overall cost-effectiveness and productivity gains. Some products in this segment can be subject to rapid product obsolescence or regulatory changes. Although Abbott has benefited from technological advantages of certain of its current products, these advantages may be reduced or eliminated as competitors introduce new products. Nutritional Products These products include a broad line of pediatric and adult nutritional products manufactured, marketed, and sold worldwide. These products are generally marketed and sold directly to customers and to institutions, wholesalers, retailers, health care facilities, government agencies, and third\n\n-party distributors from Abbott-owned distribution centers or third-party distributors. The principal products included in the Nutritional Products segment are: •various forms of prepared infant formula and follow-on formula, including Similac®Advance®, Similac® Advance® with EarlyShield®, Similac®, Similac® with Iron, Similac Sensitive®, Similac Sensitive® RS, Similac Go&Grow®, Similac® NeoSure®, Similac® Organic, Similac Special Care®, Similac® Total Comfort®, Isomil® Advance®, Isomil®, Alimentum®, Gain®, and Grow®; •adult and other pediatric nutritional products, including Ensure®, Ensure Plus®, Ensure® Muscle Health, Ensure® (with Nutrivigor®), Glucerna®, Glucerna® Hunger Smart®, ProSure®, PediaSure®, PediaSure Sidekicks®, EleCare®, Juven®, Abound®, and Pedialyte®; •nutritional products used in enteral feeding in health care institutions, including Jevity®, Glucerna® 1.2 Cal, Glucerna® 1.5 Cal, Osmolite®, Oxepa®, Freego (Enteral Pump) and Freego® sets, and Nepro®; and •Zone Perfect® bars and the EAS® family of nutritional brands, including Myoplex® and AdvantEdge®. Primary marketing efforts for nutritional products are directed toward securing the recommendation of Abbott's brand of products by physicians or other health care professionals. 
In addition, certain nutritional products sold as Gain™, Grow™, PediaSure®, PediaSure Sidekicks®, Pedialyte®, Ensure®, Zone Perfect®, EAS®/Myoplex®, and Glucerna® are also promoted directly to the public by consumer marketing efforts in select markets. Competition for nutritional products in the segment is generally from other diversified consumer and health care manufacturers. Competitive factors include consumer advertising, formulation, packaging, scientific innovation, intellectual property, price, and availability of product forms. A significant aspect of competition is the search for ingredient innovations. The introduction of new products by competitors, changes in medical practices and procedures, and regulatory changes can result in product obsolescence. In addition, private label and local manufacturers' products may increase competitive pressure. Vascular Products These products include a broad line of coronary, endovascular, vessel closure, and structural heart devices for the treatment of vascular disease manufactured, marketed and sold worldwide. The segment's products are generally marketed and sold directly to hospitals from Abbott-owned distribution centers and public warehouses. Outside the United States, sales are made either directly to customers or through distributors, depending on the market served. The principal products included in the Vascular Products segment are: •Xience Xpedition®, Xience Prime®, Xience nano™, and Xience V®, drug-eluting coronary stent systems developed on the Multi-Link Vision® platform; •Absorb®, a drug-eluting coronary bioresorbable vascular scaffold; •Multi-Link 8®, Multi-Link Vision® and Multi-Link Mini Vision®, coronary metallic stents; •TREK® and Voyager®, coronary balloon dilatation products; •Hi-Torque Balance Middleweight Elite® and ASAHI® coronary guidewires (licensed from Asahi Intecc Co., Ltd.); •StarClose® and Perclose® vessel closure devices; •Acculink®/Accunet® and Xact®/Emboshield NAV6®, carotid stent systems; •Armada® and Absolute Pro Peripheral® balloon dilatation products; •Herculink Elite Renal® and Omnilink Elite Iliac® stent systems; and •MitraClip®, a percutaneous valve repair system. The Vascular Products segment's products are subject to competition in technological innovation, price, convenience of use, service, product performance, long-term supply contracts, and product potential for overall cost-effectiveness and productivity gains. Some products in this segment can be subject to rapid product obsolescence or regulatory changes. Although Abbott has benefited from technological advantages of certain of its current products, these advantages may be reduced or eliminated as competitors introduce new products. Other Products The principal products in Abbott's other businesses include blood glucose monitoring meters, test strips, data management software and accessories for people with diabetes, including the FreeStyle® product line, and medical devices for the eye, including cataract surgery, LASIK surgery, contact lens care products, and dry eye products. These products are mostly marketed worldwide and generally sold directly to wholesalers, government agencies, health care facilities, mail order pharmacies, and independent retailers from Abbott-owned distribution centers and public warehouses. Some of these products are marketed and distributed through distributors. Blood glucose monitoring meters, contact lens care products, and dry eye products are also marketed and sold over-the-counter to consumers. 
These products are subject to competition in technological innovation, price, convenience of use, service, and product performance. Medical devices for the eye also can be subject to rapid product obsolescence or regulatory changes. INFORMATION WITH RESPECT TO ABBOTT'S BUSINESS IN GENERAL Sources and Availability of Raw Materials Abbott purchases, in the ordinary course of business, raw materials and supplies essential to Abbott's operations from numerous suppliers in the United States and abroad. There have been no recent significant availability problems or supply shortages. Patents, Trademarks, and Licenses Abbott is aware of the desirability for patent and trademark protection for its products. Accordingly, where possible, patents and trademarks are sought and obtained for Abbott's products in the United States and all countries of major marketing interest to Abbott. Abbott owns and is licensed under a substantial number of patents and patent applications. Principal trademarks and the products they cover are discussed in the Narrative Description of Business on pages 1 through 5. These, and various patents which expire during the period 2013 to 2032, in the aggregate, are believed to be of material importance in the operation of Abbott's business. Abbott believes that, after the separation of AbbVie, no single patent, license, or trademark is material in relation to Abbott's business as a whole. In connection with the separation and distribution of AbbVie, Abbott contributed certain pharmaceutical related patents, licenses, and trademarks to AbbVie. Patent-related litigation is discussed in Legal Proceedings on pages 18 through 20. Seasonal Aspects, Customers, Backlog, and Renegotiation There are no significant seasonal aspects to Abbott's business. Abbott has no single customer that, if the customer were lost, would have a material adverse effect on Abbott. Orders for Abbott's products are generally filled on a current basis, and order backlog is not material to Abbott's business. No material portion of Abbott's business is subject to renegotiation of profits or termination of contracts at the election of the government. Research and Development Abbott spent approximately $4.3 billion in 2012, $4.1 billion in 2011, and $3.7 billion in 2010, on research to discover and develop new products and processes and to improve existing products and processes. The majority of research and development expenditures was concentrated on proprietary pharmaceutical products. Environmental Matters Abbott believes that its operations comply in all material respects with applicable laws and regulations concerning environmental protection. Regulations under federal and state environmental laws impose stringent limitations on emissions and discharges to the environment from various manufacturing operations. Abbott's capital and operating expenditures for pollution control in 2012 were approximately $12 million and $63 million, respectively. After the separation of AbbVie, capital and operating expenditures for pollution control in 2013 are estimated to be $10 million and $53 million, respectively. Abbott has been identified as one of many potentially responsible parties in investigations and/or remediations at several locations in the United States, including Puerto Rico, under the Comprehensive Environmental Response, Compensation, and Liability Act, commonly known as Superfund. 
Abbott is also engaged in remediation at several other sites, some of which are owned by Abbott, in cooperation with the Environmental Protection Agency (EPA) or similar agencies. While it is not feasible to predict with certainty the final costs related to those investigations and remediation activities, Abbott believes that such costs, together with other expenditures to maintain compliance with applicable laws and regulations concerning environmental protection, should not have a material adverse effect on Abbott's financial position, cash flows, or results of operations. Employees Abbott employed approximately 91,000 persons as of December 31, 2012. Approximately 21,000 persons were transferred to AbbVie in connection with the separation. Regulation The development, manufacture, marketing, sale, promotion, and distribution of Abbott's products are subject to comprehensive government regulation by the U.S. Food and Drug Administration and similar international regulatory agencies. Government regulation by various international, supranational, federal and state agencies, both domestic and international, addresses (among other matters) the development and approval to market Abbott's products, as well as the inspection of, and controls over, research and laboratory procedures, clinical investigations, product approvals and manufacturing, labeling, packaging, supply chains, marketing and promotion, pricing and reimbursement, sampling, distribution, quality control, post-market surveillance, record keeping, storage, and disposal practices. Abbott's international operations are also affected by trade regulations in many countries that limit the import of raw materials and finished products and by local and international laws and regulations that seek to prevent corruption and bribery in the marketplace (including the United States Foreign Corrupt Practices Act and the United Kingdom Bribery Act which provide among other things, guidance on corporate interactions with government officials). In addition, Abbott is subject to laws and regulations pertaining to health care fraud and abuse, including state and federal\n\n anti-kickback and false claims laws in the United States. Prescription drug, nutrition, and medical device manufacturers such as Abbott are also subject to taxes, as well as application, product, user, establishment, and other fees. Governmental agencies can also invalidate intellectual property rights and control the entrance of multi-source drugs for small molecule and generic biologic medicines. Compliance with these laws and regulations is costly and materially affects Abbott's business. Among other effects, health care regulations substantially increase the time, difficulty, and costs incurred in obtaining and maintaining approval to market newly developed and existing products. Abbott expects this regulatory environment will continue to require significant technical expertise and capital investment to ensure compliance. Failure to comply can delay the release of a new product or result in regulatory and enforcement actions, the seizure or recall of a product, the suspension or revocation of the authority necessary for a product's production and sale, and other civil or criminal sanctions, including fines and penalties. 
In addition to regulatory initiatives, Abbott's business can be affected by ongoing studies of the utilization, safety, efficacy, and outcomes of health care products and their components that are regularly conducted by industry participants, government agencies, and others. These studies can call into question the utilization, safety, and efficacy of previously marketed products. In some cases, these studies have resulted, and may in the future result, in the discontinuance of marketing of such products domestically or globally, and may give rise to claims for damages from persons who believe they have been injured as a result of their use. Access to human health care products continues to be a subject of investigation and action by governmental agencies, legislative bodies, and private organizations in the United States and other countries. A major focus is cost containment. Efforts to reduce health care costs are also being made in the private sector, notably by health care payors and providers, which have instituted various cost reduction and containment measures. Abbott expects insurers and providers to continue attempts to reduce the cost of health care products. Many countries control the price of health care products directly or indirectly, through reimbursement, payment, pricing, coverage limitations, or compulsory licensing, and are adopting laws and rules to govern the introduction of biosimilar products. Domestic and foreign budgetary pressures may also heighten the scope and severity of pricing pressures on Abbott's products for the foreseeable future. Specifically, U.S. federal laws requiring pharmaceutical manufacturers to pay certain statutorily-prescribed rebates to state Medicaid programs on prescription drugs reimbursed under state Medicaid plans, and the efforts by states to seek additional rebates, affect Abbott's proprietary pharmaceutical business. Similarly, the Veterans Health Care Act of 1992 requires manufacturers to extend additional discounts on pharmaceutical products to various federal agencies, including the Department of Veterans Affairs, Department of Defense, Public Health Service entities and institutions, as well as certain other covered entities. The Veterans Health Care Act also established the 340B drug discount program, which requires pharmaceutical manufacturers to provide products at reduced prices to designated health care facilities. In the United States, most states also have generic substitution legislation requiring or permitting a dispensing pharmacist to substitute a different manufacturer's version of a pharmaceutical product for the one prescribed. In addition, the federal government follows a diagnosis-related group (DRG) payment system for certain institutional services provided under Medicare or Medicaid and has implemented a prospective payment system (PPS) for services delivered in hospital outpatient, nursing home, and home health settings. DRG and PPS entitle a health care facility to a fixed reimbursement based on the diagnosis and/or procedure rather than actu\n\n*** WARNING: max output size exceeded, skipping output. ***\n\nall available on Abbott's investor relations website (www.abbottinvestor.com). ITEM 1A.\nITEM 1A. RISK FACTORS In addition to the other information in this report, the following risk factors should be considered before deciding to invest in any of Abbott's securities. 
Additional risks and uncertainties not presently known to Abbott, or risks Abbott currently considers immaterial, could also affect Abbott's actual results. Abbott's business, financial condition, results of operations, or prospects could be materially adversely affected by any of these risks. Abbott may acquire other businesses, license rights to technologies or products, form alliances, or dispose of or spin-off businesses, which could cause it to incur significant expenses and could negatively affect profitability. Abbott may pursue acquisitions, technology licensing arrangements, and strategic alliances, or dispose of or spin-off some of its businesses, as part of its business strategy. Abbott may not complete these transactions in a timely manner, on a cost-effective basis, or at all, and may not realize the expected benefits. If Abbott is successful in making an acquisition, the products and technologies that are acquired may not be successful or may require significantly greater resources and investments than originally anticipated. Abbott may not be able to integrate acquisitions successfully into its existing business and could incur or assume significant debt and unknown or contingent liabilities. Abbott could also experience negative effects on its reported results of operations from acquisition or disposition-related charges, amortization of expenses related to intangibles and charges for impairment of long-term assets. These effects could cause a deterioration of Abbott's credit rating and result in increased borrowing costs and interest expense. The expiration or loss of patent protection and\n\n licenses may affect Abbott's future revenues and operating income. Many of Abbott's businesses rely on patent and trademark and other intellectual property protection. Although most of the challenges to Abbott's intellectual property have come from other businesses, governments may also challenge intellectual property protections. To the extent Abbott's intellectual property is successfully challenged, invalidated, or circumvented or to the extent it does not allow Abbott to compete effectively, Abbott's business will suffer. To the extent that countries do not enforce Abbott's intellectual property rights or to the extent that countries require compulsory licensing of its intellectual property, Abbott's future revenues and operating income will be reduced. Abbott's patents and trademarks are described in greater detail in the section captioned \"Patents, Trademarks, and Licenses,\" and litigation regarding these patents is described in the section captioned \"Legal Proceedings.\" Competitors' intellectual property may prevent Abbott from selling its products or have a material adverse effect on Abbott's future profitability and financial condition. Competitors may claim that an Abbott product infringes upon their intellectual property. Resolving an intellectual property infringement claim can be costly and time consuming and may require Abbott to enter into license agreements. Abbott cannot guarantee that it would be able to obtain license agreements on commercially reasonable terms. A successful claim of patent or other intellectual property infringement could subject Abbott to significant damages or an injunction preventing the manufacture, sale or use of affected Abbott products. Any of these events could have a material adverse effect on Abbott's profitability and financial condition. 
Abbott is subject to cost containment efforts that could cause a reduction in future revenues and operating income. In the United States and other countries, Abbott's businesses have experienced downward pressure on product pricing. Cost containment efforts by governments and private organizations are described in greater detail in the section captioned \"Regulation.\" To the extent these cost containment efforts are not offset by greater patient access to health care or other factors, Abbott's future revenues and operating income will be reduced. Abbott is subject to numerous governmental regulations and it can be costly to comply with these regulations and to develop compliant products and processes. Abbott's products are subject to rigorous regulation by the U.S. Food and Drug Administration, and numerous international, supranational, federal, and state authorities. The process of obtaining regulatory approvals to market a drug or medical device can be costly and time-consuming, and approvals might not be granted for future products, or additional indications or uses of existing products, on a timely basis, if at all. Delays in the receipt of, or failure to obtain approvals for, future products, or new indications and uses, could result in delayed realization of product revenues, reduction in revenues, and in substantial additional costs. In addition, no assurance can be given that Abbott will remain in compliance with applicable FDA and other regulatory requirements once clearance or approval has been obtained for a product. These requirements include, among other things, regulations regarding manufacturing practices, product labeling, and advertising and postmarketing reporting, including adverse event reports and field alerts due to manufacturing quality concerns. Many of Abbott's facilities and procedures and those of Abbott's suppliers are subject to ongoing regulation, including periodic inspection by the FDA and other regulatory authorities. Abbott must incur expense and spend time and effort to ensure compliance with these complex regulations. Possible regulatory actions for non-compliance could include warning letters, fines, damages, injunctions, civil penalties, recalls, seizures of Abbott's products, and criminal prosecution. These actions could result in, among other things, substantial modifications to Abbott's business practices and operations; refunds, recalls, or seizures of Abbott's products; a total or partial shutdown of production in one or more of Abbott's facilities while Abbott or Abbott's suppliers remedy the alleged violation; the inability to obtain future pre-market clearances or approvals; and withdrawals or suspensions of current products from the market. Any of these events could disrupt Abbott's business and have a material adverse effect on Abbott's revenues, profitability and financial condition. Laws and regulations affecting government benefit programs could impose new obligations on Abbott, require Abbott to change its business practices, and restrict its operations in the future. Abbott's industry is also subject to various federal, state, and international laws and regulations pertaining to government benefit program reimbursement, price reporting and regulation, and health care fraud and abuse, including anti-kickback and false claims laws, and international and individual state laws relating to pricing and sales and marketing practices. 
Violations of these laws may be punishable by criminal and/or civil sanctions, including, in some instances, substantial fines, imprisonment, and exclusion from participation in federal and state health care programs, including Medicare, Medicaid, and Veterans Administration health programs. These laws and regulations are broad in scope and they are subject to evolving interpretations, which could require Abbott to incur substantial costs associated with compliance or to alter one or more of its sales or marketing practices. In addition, violations of these laws, or allegations of such violations, could disrupt Abbott's business and result in a material adverse effect on Abbott's revenues, profitability, and financial condition. Changes in the health care regulatory environment may adversely affect Abbott's business. A number of the provisions of the Patient Protection and Affordable Care Act and the Health Care and Education Reconciliation Act of 2010 require further rulemaking action by governmental agencies to implement. The laws change access to health care products and services and create new fees for the pharmaceutical and medical device industries. Future rulemaking could increase rebates, reduce prices or the rate of price increases for health care products and services, or require additional reporting and disclosure. Abbott cannot predict the timing or impact of any future rulemaking. Abbott's research and development efforts may not succeed in developing commercially successful products and technologies, which may cause Abbott's revenue and profitability to decline. To remain competitive, Abbott must continue to launch new products and technologies. To accomplish this, Abbott commits substantial efforts, funds, and other resources to research and development. A high rate of failure is inherent in the research and development of new products and technologies. Abbott must make ongoing substantial expenditures without any assurance that its efforts will be commercially successful. Failure can occur at any point in the process, including after significant funds have been invested. Promising new product candidates may fail to reach the market or may only have limited commercial success because of efficacy or safety concerns, failure to achieve positive clinical outcomes, inability to obtain necessary regulatory approvals, limited scope of approved uses, excessive costs to manufacture, the failure to establish or maintain intellectual property rights, or infringement of the intellectual property rights of others. Even if Abbott successfully develops new products or enhancements or new generations of Abbott's existing products, they may be quickly rendered obsolete by changing customer preferences, changing industry standards, or competitors' innovations. Innovations may not be accepted quickly in the marketplace because of, among other things, entrenched patterns of clinical practice or uncertainty over third-party reimbursement. Abbott cannot state with certainty when or whether any of its products under development will be launched, whether it will be able to develop, license, or otherwise acquire compounds or products, or whether any products will be commercially successful. Failure to launch successful new products or new indications for existing products may cause Abbott's products to become obsolete, causing Abbott's revenues and operating results to suffer. New products and technological advances by Abbott's competitors may negatively affect Abbott's results of operations. 
Abbott's products face intense competition from its competitors' products. Competitors' products may be safer, more effective, more effectively marketed or sold, or have lower prices or superior performance features than Abbott's products. Abbott cannot predict with certainty the timing or impact of the introduction of competitors' products. The manufacture of many of Abbott's products is a highly exacting and complex process, and if Abbott or one of its suppliers encounters problems manufacturing products, Abbott's business could suffer. The manufacture of many of Abbott's products is a highly exacting and complex process, due in part to strict regulatory requirements. Problems may arise during manufacturing for a variety of reasons, including equipment malfunction, failure to follow specific protocols and procedures, problems with raw materials, natural disasters, and environmental factors. In addition, single suppliers are currently used for certain products and materials. If problems arise during the production of a batch of product, that batch of product may have to be discarded. This could, among other things, lead to increased costs, lost revenue, damage to customer relations, time and expense spent investigating the cause and, depending on the cause, similar losses with respect to other batches or products. If problems are not discovered before the product is released to the market, recall and product liability costs may also be incurred. To the extent Abbott or one of its suppliers experiences significant manufacturing problems, this could have a material adverse effect on Abbott's revenues and profitability. Significant safety issues could arise for Abbott's products, which could have a material adverse effect on Abbott's revenues and financial condition. Health care products typically receive regulatory approval based on data obtained in controlled clinical trials of limited duration. Following regulatory approval, these products will be used over longer periods of time in many patients. Investigators may also conduct additional, and perhaps more extensive, studies. If new safety issues are reported, Abbott may be required to amend the conditions of use for a product. For example, Abbott may be required to provide additional warnings on a product's label or narrow its approved intended use, either of which could reduce the product's market acceptance. If serious safety issues arise with an Abbott product, sales of the product could be halted by Abbott or by regulatory authorities. Safety issues affecting suppliers' or competitors' products also may reduce the market acceptance of Abbott's products. In addition, in the ordinary course of business, Abbott is the subject of product liability claims and lawsuits alleging that its products or the products of other companies that Abbott promotes have resulted or could result in an unsafe condition for or injury to patients. Product liability claims and lawsuits and safety alerts or product recalls, regardless of their validity or ultimate outcome, may have a material adverse effect on Abbott's business and reputation and on Abbott's ability to attract and retain customers. Consequences\n\n may also include additional costs, a decrease in market share for the products, lower income or exposure to other claims. Product liability losses are self-insured. Product liability claims could have a material adverse effect on Abbott's profitability and financial condition. 
Further deterioration in the economic position and credit quality of certain European countries may negatively affect Abbott's results of operations. If economic conditions in certain European countries, including Greece, Portugal, Italy, and Spain, continue to worsen, the time it takes to collect outstanding trade receivables may increase. Financial instability and fiscal deficits in these countries may result in additional austerity measures to reduce costs, including health care. At the same time, ongoing sovereign debt issues, including the impact of credit downgrades, could increase Abbott's collection risk given that a significant amount of Abbott's receivables in these countries are with governmental health care systems. Abbott depends on sophisticated information technology systems to operate its business and a cyber attack or other breach of these systems could have a material adverse effect on Abbott's results of operations. Similar to other large multi-national companies, the size and complexity of Abbott's information technology systems makes them vulnerable to a cyber attack, malicious intrusion, breakdown, destruction, loss of data privacy, or other significant disruption. Abbott's systems have been and are expected to continue to be the target of malware and other cyber attacks. Abbott has invested in its systems and the protection of its data to reduce the risk of an invasion or interruption and monitors its systems on an ongoing basis for any current or potential threats. There can be no assurance that these measures and efforts will prevent future interruptions or breakdowns that could have a significant effect on Abbott's business. Abbott may incur operational difficulties or be exposed to claims and liabilities as a result of the separation. AbbVie and Abbott entered into a separation and distribution agreement and various other agreements to govern the separation of AbbVie from Abbott and the relationship between the two companies going forward. Certain of these agreements provide for the performance of services by each company for the benefit of the other for a period of time. If AbbVie is unable to satisfy its obligations under these agreements, including its indemnification obligations, Abbott could incur operational difficulties or losses. These arrangements could also lead to disputes between Abbott and AbbVie over Abbott's rights to certain shared property and rights and over the allocation of costs and revenues for products and operations. The separation and distribution agreement also provides for, among other things, indemnification obligations designed to make AbbVie financially responsible for substantially all liabilities that may exist relating to its business activities, whether incurred prior to or after AbbVie's separation from Abbott, as well as those obligations of Abbott assumed by AbbVie pursuant to the separation and distribution agreement. It is possible that a court would disregard the allocation agreed to between Abbott and AbbVie and require Abbott to assume responsibility for obligations allocated to AbbVie. Third parties could also seek to hold Abbott responsible for any of these liabilities or obligations. The indemnity rights Abbott has under the separation agreement may not be sufficient to protect Abbott. Even if Abbott is successful in obtaining indemnification, Abbott may have to bear losses temporarily. In addition, Abbott's indemnity obligations to AbbVie may be significant. 
These risks could negatively affect Abbott's results of operations. There could be significant liability if the distribution of AbbVie common stock to Abbott shareholders is determined to be a taxable transaction. Abbott received a private letter ruling from the Internal Revenue Service (IRS) to the effect that, among other things, the separation and the distribution of AbbVie qualifies as a transaction that is tax-free for U.S. federal income tax purposes under Sections 355 and 368(a)(1)(D) of the Internal Revenue Code (the Code). In addition, Abbott received an opinion from outside tax counsel to the effect that the separation and distribution qualifies as a transaction that is described in Sections 355(a) and 368(a)(1)(D) of the Code. The ruling and the opinion rely on certain facts, assumptions, representations and undertakings from Abbott and AbbVie regarding the past and future conduct of the companies' respective businesses and other matters. If any of these facts, assumptions, representations or undertakings are incorrect or not satisfied, Abbott and its shareholders may not be able to rely on the ruling or the opinion of tax counsel and could be subject to significant tax liabilities. Notwithstanding the receipt by Abbott of the private letter ruling from the IRS and opinion of tax counsel, the IRS could determine on audit that the separation is taxable if it determines that any of these facts, assumptions, representations or undertakings are not correct or have been violated or if it disagrees with the conclusions in the opinion that are not covered by the private letter ruling, or for other reasons, including as a result of certain significant changes in the share ownership of Abbott or AbbVie after the separation. If the separation is determined to be taxable for U.S. federal income tax purposes, Abbott and its shareholders that are subject to U.S. federal income tax could incur significant U.S. federal income tax liabilities. The international nature of Abbott's business subjects it to additional business risks that may cause its revenue and profitability to decline. Abbott's business is subject to risks associated with doing business internationally. Following the separation of AbbVie, sales outside of the United States are expected to make up approximately 70 percent of Abbott's net sales. The risks associated with Abbott's operations outside the United States include: •fluctuations in currency exchange rates; •changes in medical reimbursement policies and programs; •multiple regulatory requirements that are subject to change and that could restrict Abbott's ability to manufacture, market, and sell its products; •differing local product preferences and product requirements; •trade protection measures and import or export licensing requirements; •difficulty in establishing, staffing, and managing operations; •differing labor regulations; •potentially negative consequences from changes in or interpretations of tax laws; •political and economic instability, including sovereign debt issues; •price and currency exchange controls, limitations on participation in local enterprises, expropriation, nationalization, and other governmental action; •inflation, recession and fluctuations in interest rates; •compulsory licensing or diminished protection of intellectual property; and •potential penalties or other adverse consequences for violations of anti-corruption, anti-bribery and other similar laws and regulations, including the Foreign Corrupt Practices Act and the U.K. Bribery Act. 
Events contemplated by these risks may, individually or in the aggregate, have a material adverse effect on Abbott's revenues and profitability. Other factors can have a material adverse effect on Abbott's future profitability and financial condition. Many other factors can affect Abbott's profitability and its financial condition, including: •changes in or interpretations of laws and regulations, including changes in accounting standards, taxation requirements, product marketing application standards, product labeling, source, and use laws, and environmental laws; •differences between the fair value measurement of assets and liabilities and their actual value, particularly for pensions, retiree health care, stock compensation, intangibles, and goodwill; and for contingent liabilities such as litigation, the absence of a recorded amount, or an amount recorded at the minimum, compared to the actual amount; •changes in the rate of inflation (including the cost of raw materials, commodities, and supplies), interest rates, market value of Abbott's equity investments, and the performance of investments held by Abbott or Abbott's employee benefit trusts; •changes in the creditworthiness of counterparties that transact business with or provide services to Abbott or Abbott's employee benefit trusts; •changes in business, economic, and political conditions, including: war, political instability, terrorist attacks, the threat of future terrorist activity and related military action; natural disasters; the cost and availability of insurance due to any of the foregoing events; labor disputes, strikes, slow-downs, or other forms of labor or union activity; and pressure from third-party interest groups; •changes in Abbott's business units and investments and changes in the relative and absolute contribution of each to earnings and cash flow resulting from evolving business strategies, changing product mix, changes in tax laws or tax rates both in the U.S. and abroad and opportunities existing now or in the future; •changes in the buying patterns of a major distributor, retailer, or wholesale customer resulting from buyer purchasing decisions, pricing, seasonality, or other factors, or other problems with licensors, suppliers, distributors, and business partners; •changes in credit markets impacting Abbott's ability to obtain financing for its business operations; and •legal difficulties, any of which could preclude or delay commercialization of products or adversely affect profitability, including claims asserting statutory or regulatory violations, and adverse litigation decisions. CAUTIONARY STATEMENT REGARDING FORWARD-LOOKING STATEMENTS This Form 10-K contains forward-looking statements that are based on management's current expectations, estimates, and projections. Words such as \"expects,\" \"anticipates,\" \"intends,\" \"plans,\" \"believes,\" \"seeks,\" \"estimates,\" \"forecasts,\" variations of these words, and similar expressions are intended to identify these forward-looking statements. Certain factors, including but not limited to those identified under \"Item 1A. Risk Factors\" of this Form 10-K, may cause actual results to differ materially from current expectations, estimates, projections, forecasts, and from past results. No assurance can be made that any expectation, estimate, or projection contained in a forward-looking statement will be achieved or will not be affected by the factors cited above or other future events. 
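The hunks that follow clear these stored outputs, keep the `integrity_check` call on the converted MDS folder, split the StreamingDataset sanity check into its own cell, and leave the billing estimate in place. A minimal standalone sketch of that billing arithmetic, assuming a local MDS folder at `./mds_out`, a 2048-token context window, and a single epoch (all placeholder values rather than anything fixed by this patch):

```python
# Sketch of the notebook's cost estimate, run against an already-converted
# MDS folder. Path, context length, and epoch count are assumed values.
from streaming import StreamingDataset

CONTEXT_LENGTH = 2048  # assumed FT API context window
TARGET_EPOCHS = 1      # assumed training duration

mds_dataset = StreamingDataset(local='./mds_out', shuffle=False)

# Each continued-pretraining sample is packed to the full context window,
# so billable tokens scale as (number of samples) x (context length).
n_billing_tokens = len(mds_dataset) * CONTEXT_LENGTH
print(f'Dataset has ~{n_billing_tokens} tokens that will be charged for during training')
print(f'By default, you will be charged for ~{TARGET_EPOCHS * n_billing_tokens} tokens')
```

With the figures in the removed cell output (~985,088 billable tokens and 3 epochs by default), the same arithmetic yields the ~2,955,264-token total shown there.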
" - ] - } - ], + "outputs": [], "source": [ "print(\"Num examples:\", len(df))\n", "print(\"First example:\")\n", @@ -1099,8 +826,26 @@ " break \n", "\n", "if not integrity_check(temporary_mds_output_path): \n", - " raise ValueError(\"MDS has not been created correctly. There are missing shards!\")\n", - "\n", + " raise ValueError(\"MDS has not been created correctly. There are missing shards!\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "da5f8305-6f00-484c-818c-5dcddcef0aef", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ "# Sanity Check\n", "import numpy as np\n", "from streaming import StreamingDataset\n", @@ -1148,21 +893,11 @@ "title": "" } }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "output_type": "stream", - "text": [ - "Dataset has ~985088 tokens that will be charged for during training\nBy default, you'll train for 3 epochs on this dataset\nBy default, you'll be charged for ~2955264 tokens\n" - ] - } - ], + "outputs": [], "source": [ "MAX_TOKENS_PER_EXAMPLE = FT_API_args.context_length if FT_API_args.context_length is not None else 4096\n", "TARGET_EPOCHS = FT_API_args.training_duration if FT_API_args.training_duration is not None else 1 \n", "n_epochs = TARGET_EPOCHS\n", - "n_train_examples = len(raw_dataset)\n", "\n", "n_billing_tokens_in_dataset = len(mds_dataset) * FT_API_args.context_length \n", "print(f\"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training\")\n", "print(f\"By default, you'll train for {n_epochs} epochs on this dataset\")\n", "print(f\"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens\")" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "8775fed8-6440-4a20-82f3-59b6cff73421", "showTitle": false, "title": "" } }, From 8b75f946a353c311eb53e4116d07cb0514844e80 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Fri, 12 Jan 2024 10:01:15 -0800 Subject: [PATCH 62/63] remove scripts keep notebook --- .../data_prep/validate_and_tokenize_data.py | 731 ------------------ .../test_validate_and_tokenize_data.py | 131 ---- 2 files changed, 862 deletions(-) delete mode 100644 scripts/data_prep/validate_and_tokenize_data.py delete mode 100644 tests/a_scripts/data_prep/test_validate_and_tokenize_data.py diff --git a/scripts/data_prep/validate_and_tokenize_data.py b/scripts/data_prep/validate_and_tokenize_data.py deleted file mode 100644 index 3b6c109199..0000000000 --- a/scripts/data_prep/validate_and_tokenize_data.py +++ /dev/null @@ -1,731 +0,0 @@ -# Databricks notebook source -# MAGIC %md -# MAGIC Copyright 2022 MosaicML LLM Foundry authors. -# MAGIC SPDX-License-Identifier: Apache-2.0 - -# COMMAND ---------- - -# MAGIC %md -# MAGIC JIRA: https://databricks.atlassian.net/jira/software/c/projects/STR/issues/STR-141?filter=allissues - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ## Warning: Important Alert Regarding the Script Usage -# MAGIC -# MAGIC ### Script Purpose: -# MAGIC - **Not for Training**: This script is not utilized during the training process.
-# MAGIC - **Ad-Hoc Validation**: It serves as an ad-hoc utility for users to run independently prior to starting fine-tuning. -# MAGIC - **Data Verification**: Its primary function is to validate the user's data before they invoke the Fine-Tuning (FT) API. -# MAGIC - **Cost Estimation**: Users can estimate the cost implications with this script. -# MAGIC -# MAGIC ### Usage Scenario: -# MAGIC This script is particularly useful in scenarios where there is a risk of data being malformed. It acts as a preventive measure to ensure data integrity and helps in cost assessment for the fine-tuning process. -# MAGIC -# MAGIC ### Note on Long-Term Solution: -# MAGIC - **Temporary Measure**: This script is a stop-gap solution. -# MAGIC - **Future Development**: We are in the process of developing a long-term data preparation service, which will eventually replace this script. -# MAGIC -# MAGIC ### Checks Include: -# MAGIC - check input dataset: -# MAGIC 1) verify if dataset input format is valid (need to be one of these: Huggingface, delta table, dbfs:/Volumes, cloud path); -# MAGIC - check HF input location: -# MAGIC 1) load dataset info and check if it is accessible; -# MAGIC 2) verify if the split exists. -# MAGIC - check cloud path location: -# MAGIC 1) check the cloud prefix is compliant with composers' object store supports (gs, s3, oci) -# MAGIC 2) check if list objects returns nothing. -# MAGIC - count_tokens: -# MAGIC 1) For IFT task: validate tokenization by running tokenizer + filter on the entire dataset. count the number of tokens. Throws error if there are any empty responses or prompts -# MAGIC 2) For CPT task: call donwload_text_to_mds.py and count the resulted mds dataset. Note this could take a long time. -# MAGIC -# MAGIC ### Questions: -# MAGIC - Is "download_text_to_mds.py" always callable from the validation script? -# MAGIC - what is the function to reuse to run tokenization on HF datasets with filters? -# MAGIC - The inputs to this validation script is assumed to be the same or a subset of the FT API arguments, i.e., a configuration like below. Is this a valid assumption? -# MAGIC ``` -# MAGIC cfg = { -# MAGIC model: str, -# MAGIC train_data_path: str, -# MAGIC save_folder: str, -# MAGIC *, -# MAGIC task_type: Optional[str] = "INSTRUCTION_FINETUNE", -# MAGIC eval_data_path: Optional[str] = None, -# MAGIC eval_prompts: Optional[List[str]] = None, -# MAGIC custom_weights_path: Optional[str] = None, -# MAGIC training_duration: Optional[str] = None, -# MAGIC learning_rate: Optional[float] = None, -# MAGIC context_length: Optional[int] = None, -# MAGIC experiment_trackers: Optional[List[Dict]] = None, -# MAGIC data_prep_config: Optional[Dict] = None, -# MAGIC disable_credentials_check: Optional[bool] = None, -# MAGIC timeout: Optional[float] = 10, -# MAGIC future: Literal[False] = False, -# MAGIC } -# MAGIC - What null checkings do we want to have? -# MAGIC - How to map the model to its expected eos_text / bos_text format? [Ref](https://databricks.slack.com/archives/C05K29T9NBF/p1703644153357929?thread_ts=1703643155.904289&cid=C05K29T9NBF) -# MAGIC - How to automate tokenization for CPT? it is always really standard: sequence -> concat(tok(BOS), tok(sequence), tok(EOS)), and then concatenate sequences. 
[Ref](https://databricks.slack.com/archives/C05K29T9NBF/p1703698056000399?thread_ts=1703643155.904289&cid=C05K29T9NBF) -# MAGIC ``` - -# COMMAND ---------- - -# MAGIC %pip install llm-foundry - -# COMMAND ---------- - -# dbutils.library.restartPython() - -# COMMAND ---------- - -import os -import re -from argparse import ArgumentParser, Namespace -from typing import Tuple, Union - -from composer.utils import (ObjectStore, maybe_create_object_store_from_uri, - parse_uri) -from datasets import get_dataset_split_names -from huggingface_hub import dataset_info -from omegaconf import OmegaConf as om - -from llmfoundry.utils import build_tokenizer - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ## User Defines the Cell Below - -# COMMAND ---------- - -FT_API_args = Namespace( - model='EleutherAI/gpt-neox-20b', - train_data_path= - 'tatsu-lab/alpaca', # 'mosaicml/dolly_hhrlhf/train', # tatsu-lab/alpaca/train', - save_folder= - 'dbfs:/databricks/mlflow-tracking/EXPERIMENT_ID/RUN_ID/artifacts/checkpoints', - task_type='INSTRUCTION_FINETUNE', - eval_data_path=None, - eval_prompts=None, - custom_weights_path=None, - training_duration=None, - learning_rate=None, - context_length=2048, - experiment_trackers=None, - disable_credentials_check=None, - # Extra argument to add to FT API - # See comment https://databricks.atlassian.net/browse/STR-141?focusedCommentId=4308948 - data_prep_config={ - 'data_validation': True, - 'data_prep': False - }, - timeout=10, - future=False, -) - -os.environ['HF_ASSETS_CACHE'] = '/tmp/' -os.environ['HF_HOME'] = '/tmp/' -os.environ['HF_HUB_CACHE'] = '/tmp/' -os.environ['HF_DATASETS_CACHE'] = '/tmp/' - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ## Adapted from llmfoundry/scripts/data_prep/convert_text_to_mds.py - -# COMMAND ---------- - -# Copyright 2022 MosaicML LLM Foundry authors -# SPDX-License-Identifier: Apache-2.0 - -# Taken from llmfoundry/scripts/data_prep/convert_text_to_mds.py - -import logging -import math -import tempfile -from argparse import Namespace -from concurrent.futures import ProcessPoolExecutor -from glob import glob -from typing import Iterable, List, Tuple, cast - -from composer.utils import (ObjectStore, maybe_create_object_store_from_uri, - parse_uri) -from streaming import MDSWriter -from tqdm import tqdm -from transformers import AutoTokenizer - -from llmfoundry.data import ConcatTokensDataset -from llmfoundry.utils.data_prep_utils import (DownloadingIterable, - merge_shard_groups) - -log = logging.getLogger(__name__) -DONE_FILENAME = '.text_to_mds_conversion_done' - - -def parse_args( - tokenizer: str, - concat_tokens: int, - output_folder: str, - input_folder: str, - compression: str = 'zstd', - bos_text: str = '', - eos_text: str = '', - no_wrap: bool = False, - processes: int = 32, # min(max(psutil.cpu_count() - 2, 1), 32), - reprocess: bool = False -) -> Namespace: - - parser = ArgumentParser( - description= - 'Convert text files into MDS format, optionally concatenating and tokenizing', - ) - parsed = Namespace(tokenizer=tokenizer, - concat_tokens=concat_tokens, - output_folder=output_folder, - input_folder=input_folder, - eos_text=eos_text, - bos_text=bos_text, - no_wrap=no_wrap, - compression=compression, - processes=processes, - reprocess=reprocess) - - # Make sure we have needed concat options - if (parsed.concat_tokens is not None and - isinstance(parsed.concat_tokens, int) and parsed.tokenizer is None): - parser.error( - 'When setting --concat_tokens, you must specify a --tokenizer') - - # now that we have validated them, change 
BOS/EOS to strings - if parsed.bos_text is None: - parsed.bos_text = '' - if parsed.eos_text is None: - parsed.eos_text = '' - return parsed - - -def get_object_names(input_folder: str) -> List[str]: - """Get object names from a local or remote folder. - - Args: - input_folder (str): local or remote folder path. - """ - object_store = maybe_create_object_store_from_uri(input_folder) - if object_store is not None: - _, _, folder_prefix = parse_uri(input_folder) - names = [ - name for name in object_store.list_objects(folder_prefix) - if name.endswith('.txt') - ] - else: - # input_folder is a local folder - names = [ - text_file for dirpath, _, _ in os.walk(input_folder) - for text_file in glob(os.path.join(dirpath, '*.txt')) - ] - # return names, sizes - log.info(f'Found {len(names)} text files at {input_folder}') - - return names - - -def get_task_args( - object_names: List[str], - output_root: str, - input_folder: str, - n_groups: int, - tokenizer_name: str, - concat_tokens: int, - eos_text: str, - bos_text: str, - no_wrap: bool, - compression: str, -) -> Iterable: - """Get download_and_convert arguments split across n_groups. - - Each group handles a portion of object_names. - - Args: - object_names (List[str]): Names of objects to process - output_root (str): Folder to write MDS shards to - input_folder (str): Folder of text files to process - n_groups (int): Number of groups to split the object names into - tokenizer_name (str): Name of tokenizer to use - concat_tokens (int): Concantenate up to this many tokens - eos_text (str): Textend to append to each example to separate concatenated samples - bos_text (str): Text to prepend to each example to separate concatenated samples - no_wrap: (bool): Whether to let text examples wrap across multiple training examples - compression (str): The compression algorithm to use for MDS writing - """ - num_objects = len(object_names) - objs_per_group = math.ceil(num_objects / n_groups) - for group, i in enumerate(range(0, num_objects, objs_per_group)): - output_subdir = os.path.join(output_root, str(group)) - yield ( - object_names[i:min(i + objs_per_group, num_objects)], - output_subdir, - input_folder, - tokenizer_name, - concat_tokens, - eos_text, - bos_text, - no_wrap, - compression, - ) - - -def download_and_convert_starargs(args: Tuple): - """Helper function to call download_and_convert with star args. - - This helps us use download_and_convert with mutiprocessing. - """ - return download_and_convert(*args) - - -def download_and_convert( - file_names: List[str], - output_folder: str, - input_folder: str, - tokenizer_name: str, - concat_tokens: int, - eos_text: str, - bos_text: str, - no_wrap: bool, - compression: str, -): - """Downloads and converts text fies to MDS format. 
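For reference, the grouping in `get_task_args` above is plain slicing: `N` object names are split into contiguous chunks of `ceil(N / n_groups)`, one chunk per worker process. A standalone sketch with hypothetical file names:

```
# Partition a list of objects into contiguous groups, one per process,
# mirroring the ceil(N / n_groups) slicing used by get_task_args.
import math

object_names = [f'doc_{i}.txt' for i in range(10)]  # hypothetical names
n_groups = 4

objs_per_group = math.ceil(len(object_names) / n_groups)
groups = [
    object_names[i:i + objs_per_group]
    for i in range(0, len(object_names), objs_per_group)
]
print([len(g) for g in groups])  # [3, 3, 3, 1]
```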
- - Args: - file_names (List[str]): Files to process - output_folder (str): Folder to write MDS shards to - input_folder (str): Folder of text files to process - tokenizer_name (str): Name of tokenizer to use - concat_tokens (int): Concantenate up to this many tokens - eos_text (str): Textend to append to each example to separate concatenated samples - bos_text (str): Text to prepend to each example to separate concatenated samples - no_wrap: (bool): Whether to let text examples wrap across multiple training examples - compression (str): The compression algorithm to use for MDS writing - """ - object_store = maybe_create_object_store_from_uri(input_folder) - - # Download file_names - with tempfile.TemporaryDirectory() as tmp_dir: - downloading_iter = DownloadingIterable(object_names=file_names, - output_folder=tmp_dir, - object_store=object_store) - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) - tokenizer.model_max_length = 5000000000 # Hack to prevent warnings from HuggingFace - - # Use the ConcatTokensDataset from LLM-foundry to concatenate sequences of tokens up - # to the maximum sequence length - dataset = ConcatTokensDataset( - hf_dataset=downloading_iter, - max_length=concat_tokens, - tokenizer=tokenizer, - eos_text=eos_text, - bos_text=bos_text, - no_wrap=no_wrap, - ) - - columns = {'tokens': 'bytes'} - - log.info('Converting to MDS format...') - with MDSWriter(out=output_folder, - columns=columns, - compression=compression) as out: - for sample in tqdm(dataset): - out.write(sample) - - -def is_remote_path(path: str) -> bool: - """Checks whether a path is a remote path. - - Args: - path (str): path to check - """ - backend, _, _ = parse_uri(path) - return backend != '' - - -def is_already_processed(output_root: str, args_str: str, - object_names: List[str]) -> bool: - """Determines whether a group of text files has already been processed. - - Checks the done fie at output root to determine this. - - Args: - output_root (str): Output folder where a done file may exist - args_str (str): String representation of the arguments - object_names (List[str]): Names of objects to convert to MDS format - """ - # Retrieve the done file contents - output_object_store = maybe_create_object_store_from_uri(output_root) - if output_object_store is not None: - # Download and read the done file from the remote object store - _, _, output_folder_prefix = parse_uri(output_root) - try: - with tempfile.TemporaryDirectory() as tmp_dir: - done_file = os.path.join(tmp_dir, DONE_FILENAME) - output_object_store.download_object( - os.path.join(output_folder_prefix, DONE_FILENAME), - done_file) - with open(done_file) as df: - done_file_contents = df.read().splitlines() - except FileNotFoundError: - return False - else: - # Read the local done file - done_file = os.path.join(output_root, DONE_FILENAME) - if not os.path.isfile(done_file): - return False - with open(done_file) as df: - done_file_contents = df.read().splitlines() - # Compare the arguments - prev_args_str = done_file_contents[0] - if prev_args_str != args_str: - return False - - # Compare file names - prev_names = done_file_contents[1:] - if len(prev_names) != len(object_names): - return False - for idx, prev_name in enumerate(prev_names): - if object_names[idx] != prev_name: - return False - return True - - -def write_done_file(folder: str, args_str: str, object_names: List[str]): - """Write a file to signify completion. - - This the done file includes the arguments to processing and - a list of objects that were processed. 
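The `ConcatTokensDataset` used in `download_and_convert` implements the standard continued-pretraining packing mentioned in the questions at the top of this script: tokenize each document, wrap it with the configured `bos_text`/`eos_text`, concatenate everything, and slice the stream into fixed-length windows. A minimal sketch of that idea with toy inputs, using the tokenizer's EOS token as the separator for illustration (this script leaves `bos_text` and `eos_text` empty by default); the real class streams downloaded files and serializes each window into an MDS shard:

```
# Concat-tokens packing in miniature: tokenize, append a separator,
# concatenate, then cut into fixed-length training examples.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b')
context_length = 8  # tiny on purpose so the windows are easy to inspect

docs = ['hello world', 'continued pre-training packs documents together']
buffer = []
for doc in docs:
    buffer += tokenizer(doc, add_special_tokens=False)['input_ids']
    buffer.append(tokenizer.eos_token_id)

windows = [
    buffer[i:i + context_length]
    for i in range(0, len(buffer) - context_length + 1, context_length)
]
print(f'{len(buffer)} raw tokens -> {len(windows)} examples of length {context_length}')
```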
- - Args: - folder (str): Folder to write the done file to - args_str (str): String representation of arguments - object_names (List[str]): List of objects to convert to MDS format - """ - with open(os.path.join(folder, DONE_FILENAME), 'w') as done_file: - done_file.write('\n'.join([args_str] + object_names) + '\n') - - -def convert_text_to_mds( - tokenizer_name: str, - output_folder: str, - input_folder: str, - concat_tokens: int, - eos_text: str, - bos_text: str, - no_wrap: bool, - compression: str, - processes: int, - args_str: str, - reprocess: bool, -): - """Convert a folder of text files to MDS format. - - Args: - tokenizer_name (str): Name of tokenizer to use - output_folder (str): Folder to write MDS shards to - input_folder (str): Folder of text files to process - concat_tokens (int): Concantenate up to this many tokens - eos_text (str): Textend to append to each example to separate concatenated samples - bos_text (str): Text to prepend to each example to separate concatenated samples - no_wrap: (bool): Whether to let text examples wrap across multiple training examples - compression (str): The compression algorithm to use for MDS writing - processes (int): The number of processes to use. - args_str (str): String representation of the arguments - reprocess (bool): Whether to always reprocess the given folder of text files - """ - is_remote_output = is_remote_path(output_folder) - - object_names = get_object_names(input_folder) - if len(object_names) == 0: - raise ValueError(f'No text files were found at {input_folder}.') - - # Check if the text files in the bucket have already been processed. - if not reprocess and is_already_processed(output_folder, args_str, - object_names): - log.info( - f'Input folder {input_folder} is already processed at {output_folder} and ' - + - 'reprocess is set to False. Set reprocess to True if you would like to force reprocessing.' - ) - return - - # Use a temporary local directory if the output is remote and there are more than 1 processes - local_output_folder = tempfile.TemporaryDirectory( - ).name if is_remote_output else output_folder - - if processes > 1: - # Download and convert the text files in parallel - args = get_task_args(object_names, local_output_folder, input_folder, - processes, tokenizer_name, concat_tokens, eos_text, - bos_text, no_wrap, compression) - with ProcessPoolExecutor(max_workers=processes) as executor: - list(executor.map(download_and_convert_starargs, args)) - - # Merge the mds shards from each of the processes into a single folder - merge_shard_groups(local_output_folder) - else: - download_and_convert(object_names, local_output_folder, input_folder, - tokenizer_name, concat_tokens, eos_text, bos_text, - no_wrap, compression) - - # Write a done file with the args and object names - write_done_file(local_output_folder, args_str, object_names) - - if is_remote_output: - # Upload the local output to the remote location - output_object_store = cast( - ObjectStore, maybe_create_object_store_from_uri(output_folder)) - _, _, output_folder_prefix = parse_uri(output_folder) - files_to_upload = os.listdir(local_output_folder) - - for file in files_to_upload: - assert not os.path.isdir(file) - remote_path = os.path.join(output_folder_prefix, file) - output_object_store.upload_object( - remote_path, os.path.join(local_output_folder, file)) - - -def _args_str(original_args: Namespace) -> str: - """Create a string from the args to determine whether to reprocess. - - Args: - original_args (Namespace): Arguments to main function. 
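The `write_done_file` / `is_already_processed` pair above is what makes reprocessing idempotent: the done file records the argument string on its first line and the processed object names on the remaining lines, and a later run is skipped only when both match. A condensed, local-only sketch of that round trip:

```
# Done-file round trip: record args + object names, then check whether a
# later run with the same inputs can be skipped. Local filesystem only.
import os
import tempfile

DONE_FILENAME = '.text_to_mds_conversion_done'

def write_done(folder: str, args_str: str, object_names: list) -> None:
    with open(os.path.join(folder, DONE_FILENAME), 'w') as f:
        f.write('\n'.join([args_str] + object_names) + '\n')

def already_processed(folder: str, args_str: str, object_names: list) -> bool:
    done_path = os.path.join(folder, DONE_FILENAME)
    if not os.path.isfile(done_path):
        return False
    with open(done_path) as f:
        contents = f.read().splitlines()
    return contents[0] == args_str and contents[1:] == object_names

with tempfile.TemporaryDirectory() as folder:
    write_done(folder, 'tokenizer=gpt-neox,concat_tokens=2048', ['a.txt', 'b.txt'])
    print(already_processed(folder, 'tokenizer=gpt-neox,concat_tokens=2048', ['a.txt', 'b.txt']))  # True
    print(already_processed(folder, 'tokenizer=gpt-neox,concat_tokens=4096', ['a.txt', 'b.txt']))  # False
```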
- """ - # Take the arguments that influence the final result. - # reprocess and max_mds_writer_workers are not taken. - args = Namespace( - tokenizer_name=original_args.tokenizer, - output_folder=original_args.output_folder, - input_folder=original_args.input_folder, - concat_tokens=original_args.concat_tokens, - eos_text=original_args.eos_text, - bos_text=original_args.bos_text, - no_wrap=original_args.no_wrap, - compression=original_args.compression, - processes=original_args.processes, - ) - - return str(args) - - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ## Validate Inputs and Count tokens - -# COMMAND ---------- - -import json - -from streaming.base.storage.download import download_file -from streaming.base.storage.upload import CloudUploader - - -def integrity_check(out: Union[str, Tuple[str, str]]): - """Check if the index file has integrity. - - If index is a cloud url, first download it to a temp local file. - - Args: - out (Union[str, Tuple[str,str]]): MDS dataset path - """ - - def count_shards(mds_root: str): - n_shard_files = 0 - cu = CloudUploader.get(mds_root, exist_ok=True, keep_local=True) - for o in cu.list_objects(): - if o.endswith('.mds'): - n_shard_files += 1 - return n_shard_files - - cu = CloudUploader.get(out, keep_local=True, exist_ok=True) - - with tempfile.TemporaryDirectory() as temp_dir: - if cu.remote: - download_file(os.path.join(cu.remote, 'index.json'), - os.path.join(temp_dir, 'index.json'), - timeout=60) - actual_n_shard_files = count_shards(cu.remote) - local_merged_index_path = os.path.join(temp_dir, 'index.json') - else: - local_merged_index_path = os.path.join(cu.local, 'index.json') - actual_n_shard_files = count_shards(cu.local) - - merged_index = json.load(open(local_merged_index_path, 'r')) - n_shard_files = len( - {b['raw_data']['basename'] for b in merged_index['shards']}) - return n_shard_files == actual_n_shard_files - - -def check_HF_datasets(dataset_names_with_splits: list): - token = os.environ.get('HUGGING_FACE_HUB_TOKEN') - for dataset_name_with_split in dataset_names_with_splits: - dataset_name, split = os.path.split(dataset_name_with_split) - # make sure we have a dataset and split - if not dataset_name or not split: - return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Please ensure that you include the split name (e.g. 'mosaicml/dolly_hhrlhf/train')." - # check user access to the dataset - try: - _ = dataset_info(dataset_name) - except: - token_warning = '' - if not token: - token_warning = ' If this is a private dataset, please set your HUGGING_FACE_HUB_TOKEN using: mcli create secret hf.' - return False, f"Failed to load Hugging Face dataset {dataset_name_with_split}. Please ensure that the dataset exists and that you have access to it. Remember to include the split name (e.g. 'mosaicml/dolly_hhrlhf/train')." + token_warning - # check that split exists - try: - splits = get_dataset_split_names(dataset_name) - except: # error raised in the case of multiple subsets - return False, f'Failed to load Hugging Face dataset {dataset_name_with_split}. Please make sure that the split is valid and that your dataset does not have subsets.' - if split not in splits: - return False, f'Failed to load Hugging Face dataset {dataset_name_with_split}. Split not found.' - return True, '' - - -def is_hf_dataset_path(path: str): - """Check if a given string is a dataset path used by Hugging Face. - - Args: - path (str): The string to be checked. - - Returns: - bool: True if the string is a dataset path, False otherwise. 
- """ - # Regular expression to match the dataset path pattern - pattern = r'^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+/?(train|validation|test)?/?$' - - return bool(re.match(pattern, path)) - - -def create_om_cfg(FT_API_args: Namespace): - task_type = FT_API_args.task_type - train_data_path = FT_API_args.train_data_path - model = FT_API_args.model - max_seq_len = FT_API_args.context_length - - common_args = { - 'drop_last': False, - 'num_workers': 2, - 'prefetch_factor': 2, - 'pin_memory': False, - 'persistent_workers': False, - 'timeout': 0 - } - if task_type == 'INSTRUCTION_FINETUNE': - cfg = om.create({ - 'dataset': { - 'hf_name': train_data_path, - 'split': 'train', - 'max_seq_len': max_seq_len, - 'decoder_only_format': True, - 'allow_pad_trimming': False, - 'shuffle': True, - }, - **common_args - }) - - else: - cfg = om.create({ - 'name': 'finetuning', - 'dataset': { - 'remote': train_data_path, - 'local': train_data_path, - 'split': 'train', - 'max_seq_len': max_seq_len, - 'decoder_only_format': True, - 'allow_pad_trimming': False, - 'packing_ratio': None, - 'shuffle': True, - }, - **common_args - }) - - tokenizer = build_tokenizer( - tokenizer_name=model, - tokenizer_kwargs={'model_max_length': max_seq_len}, - ) - - return cfg, tokenizer - - -# COMMAND ---------- - - -# build cfg from the inputs -def main(): - if FT_API_args.task_type == 'INSTRUCTION_FINETUNE': - # check if train_data_path is a valid HF dataset url with splits. - if not is_hf_dataset_path(FT_API_args.train_data_path): - raise ValueError( - f'Input path {FT_API_args.train_data_path} is not supported. It needs to be a valid Huggingface dataset path.' - ) - # load dataset.info and see if HF tokens are correctly set. - check_HF_datasets(FT_API_args.train_data_path) - - cfg, tokenizer = create_om_cfg(FT_API_args) - - elif FT_API_args.task_type == 'CONTINUED_PRETRAIN': - # check if train_data_path is a valid object store that composer supports - cfg, tokenizer = create_om_cfg(FT_API_args) - - input_folder = FT_API_args.train_data_path - output_folder = FT_API_args.save_folder - concat_tokens = FT_API_args.context_length - tokenizer_name = FT_API_args.model - - # Run convert_text_to_mds.py and dump MDS dataset to "save_folder" - args = parse_args(tokenizer, concat_tokens, output_folder, input_folder) - convert_text_to_mds(tokenizer_name=args.tokenizer, - output_folder=args.output_folder, - input_folder=args.input_folder, - concat_tokens=args.concat_tokens, - eos_text=args.eos_text, - bos_text=args.bos_text, - no_wrap=args.no_wrap, - compression=args.compression, - processes=args.processes, - reprocess=args.reprocess, - args_str=_args_str(args)) - - # Check if the MDS dataset is integral by checking index.json - if integrity_check(args.output_folder): - raise RuntimeError( - f'{args.output_folder} has mismatched number of shard files between merged index.json and actual shards!' - ) - - print('Converted data for continnued pre-training was saved in: ', - args.output_folder) - - else: - raise ValueError( - f'task_type can only be INSTRUCTION_FINETUNE or Continued_Pretraining but got {FT_API_args.task_type} instead!' - ) - # Run a few checks on resulted MDS datasets - # 1. no shards in output_folder - # 2. 
check shard completeness by downloading and inspecting index.json - - from llmfoundry.data.finetuning import build_finetuning_dataloader - tokenizer_name = 'EleutherAI/gpt-neox-20b' - tokenizer_kwargs = {'model_max_length': cfg.dataset.max_seq_len} - tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) - - device_batch_size = 1 - dataspec = build_finetuning_dataloader(cfg, tokenizer, device_batch_size) - dataloader = dataspec.dataloader - token_counting_func = dataspec.get_num_tokens_in_batch - - total_tokens = 0 - for batch in dataloader: - total_tokens += token_counting_func(batch) - - print('Total number of tokens:', total_tokens) - - -# COMMAND ---------- - -if __name__ == '__main__': - main() diff --git a/tests/a_scripts/data_prep/test_validate_and_tokenize_data.py b/tests/a_scripts/data_prep/test_validate_and_tokenize_data.py deleted file mode 100644 index 8a78581fef..0000000000 --- a/tests/a_scripts/data_prep/test_validate_and_tokenize_data.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright 2022 MosaicML LLM Foundry authors -# SPDX-License-Identifier: Apache-2.0 -from argparse import Namespace -from typing import Any -from unittest.mock import MagicMock, mock_open, patch - -from transformers import AutoTokenizer - -from scripts.data_prep.validate_and_tokenize_data import (check_HF_datasets, - create_om_cfg, - integrity_check, - is_hf_dataset_path) - - -class MockCloudUploader: - - def __init__(self): - self.remote = 'some_remote_path' - self.local = 'some_local_path' - - def list_objects(self): - return ['shard1.mds', 'shard2.mds'] - - -class MockDatasetInfo: - - def __init__(self): - self.id = 'valid_dataset' - self.description = 'A mock dataset description' - - -@patch('scripts.data_prep.validate_and_tokenize_data.CloudUploader.get') -@patch('scripts.data_prep.validate_and_tokenize_data.download_file') -@patch('scripts.data_prep.validate_and_tokenize_data.json.load') -@patch( - 'builtins.open', - new_callable=mock_open, - read_data= - '{"shards": [{"raw_data": {"basename": "shard1.mds"}}, {"raw_data": {"basename": "shard2.mds"}}]}' -) -def test_integrity_check(mock_file_open: Any, mock_json_load: Any, - mock_download_file: Any, mock_cloud_uploader: Any): - # Setup mocks - mock_cloud_uploader.return_value = MockCloudUploader() - mock_json_load.return_value = { - 'shards': [{ - 'raw_data': { - 'basename': 'shard1.mds' - } - }, { - 'raw_data': { - 'basename': 'shard2.mds' - } - }] - } - - # Test case where integrity is valid - assert integrity_check('mock_dataset_path') - - # Test case where integrity is invalid - # Modify the mock to simulate a different scenario - mock_json_load.return_value = { - 'shards': [{ - 'raw_data': { - 'basename': 'shard1.mds' - } - }] - } # less shards than expected - assert not integrity_check('mock_dataset_path') - - -# Additional tests can be written for cases like remote URL, file not found, etc. 
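For reference, the `is_hf_dataset_path` pattern tested below accepts an `org/dataset` name with an optional `train`/`validation`/`test` split, which is why `'user/dataset/train'` and `'user/dataset'` pass while `'user@dataset/train'` and a bare dataset name do not. A quick standalone check, with the pattern copied from the script:

```
# Behaviour of the dataset-path pattern from is_hf_dataset_path; the example
# paths mirror the unit test cases below.
import re

pattern = r'^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+/?(train|validation|test)?/?$'

for path in ['user/dataset/train', 'user/dataset', 'user@dataset/train', 'just_dataset_name']:
    print(f'{path!r} -> {bool(re.match(pattern, path))}')
# 'user/dataset/train' -> True
# 'user/dataset' -> True
# 'user@dataset/train' -> False
# 'just_dataset_name' -> False
```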
- - -@patch('scripts.data_prep.validate_and_tokenize_data.dataset_info') -@patch('scripts.data_prep.validate_and_tokenize_data.get_dataset_split_names') -def test_check_HF_datasets(mock_get_splits: Any, mock_dataset_info: Any): - # Setup mocks - mock_get_splits.return_value = ['train', 'test'] - mock_dataset_info.return_value = MockDatasetInfo() - - # Test valid dataset with valid split - result, _ = check_HF_datasets(['valid_dataset/train']) - assert result - - # Test valid dataset with invalid split - result, _ = check_HF_datasets(['valid_dataset/invalid_split']) - assert not result - - # Test invalid dataset - mock_dataset_info.side_effect = Exception('Dataset not found') - result, _ = check_HF_datasets(['invalid_dataset/train']) - assert not result - - -# Additional tests for private datasets, token issues, etc. - - -def test_is_hf_dataset_path(): - # Valid dataset paths - assert is_hf_dataset_path('user/dataset/train') - assert is_hf_dataset_path('user/dataset') - - # Invalid dataset paths - assert not is_hf_dataset_path('user@dataset/train') - assert not is_hf_dataset_path('just_dataset_name') - assert not is_hf_dataset_path('user/dataset/unknown_split/') - - -@patch('transformers.AutoTokenizer.from_pretrained') -def test_create_om_cfg_instruction_finetune(mock_from_pretrained: Any): - mock_from_pretrained.return_value = MagicMock(spec=AutoTokenizer) - args = Namespace(task_type='INSTRUCTION_FINETUNE', - train_data_path='hf_dataset/train', - model='model_name', - context_length=512) - cfg, _ = create_om_cfg(args) - assert cfg.dataset.hf_name == 'hf_dataset/train' - assert cfg.dataset.max_seq_len == 512 - - -@patch('transformers.AutoTokenizer.from_pretrained') -def test_create_om_cfg_continued_pretrain(mock_from_pretrained: Any): - mock_from_pretrained.return_value = MagicMock(spec=AutoTokenizer) - args = Namespace(task_type='CONTINUED_PRETRAIN', - train_data_path='object_store_path', - model='model_name', - context_length=512) - cfg, _ = create_om_cfg(args) - assert cfg.dataset.remote == 'object_store_path' - assert cfg.dataset.max_seq_len == 512 From 22014d678488e1dcd2f2f88331e39df5ac5449ac Mon Sep 17 00:00:00 2001 From: xiaohanzhan-db Date: Fri, 12 Jan 2024 18:58:50 +0000 Subject: [PATCH 63/63] update notebook. rephrase. --- notebooks/validate_and_tokenize_data.ipynb | 90 +++++++++++++++++----- 1 file changed, 69 insertions(+), 21 deletions(-) diff --git a/notebooks/validate_and_tokenize_data.ipynb b/notebooks/validate_and_tokenize_data.ipynb index 8d974cc479..0aef0bf6db 100644 --- a/notebooks/validate_and_tokenize_data.ipynb +++ b/notebooks/validate_and_tokenize_data.ipynb @@ -4,7 +4,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "f275a21b-47d4-472c-972b-e2a84a597db2", "showTitle": false, @@ -12,7 +15,7 @@ } }, "source": [ - "# FM FT API: Validation and Cost Estimation\n", + "# FM FT API: Data Validation and \\$Token Estimation\n", "\n", "#### Usage Scenario:\n", "This notebook goes hand-in-hand with Databricks-Mosaicml's FT API. Our customers may find it useful in scenarios where there is a risk of data being malformed. 
It acts as a preventive measure to ensure data integrity and helps in cost assessment for the fine-tuning process.\n", @@ -55,7 +58,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "3d08a21c-9f5a-4ad2-af85-e016335cc53d", "showTitle": false, @@ -188,7 +194,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "3a513cdd-967d-4a87-b56f-340053fa79cd", "showTitle": false, @@ -203,7 +212,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "cfebdfdf-b87c-4a77-b97c-4697566a55fa", "showTitle": false, @@ -251,7 +263,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "39c45005-1a77-4162-b9e4-bd8df6f5ec69", "showTitle": false, @@ -327,7 +342,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "06d46367-bd32-473a-9f16-1b34a8dd9356", "showTitle": false, @@ -400,7 +418,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "9713a0ce-80f4-4187-b10b-4223b17fe4c1", "showTitle": false, @@ -408,7 +429,7 @@ } }, "source": [ - "#### Cost Estimation\n", + "#### Token Estimation\n", "\n", "Tokenize the raw dataset and we see some statistics of the tokens and estimate the overall cost based on default trainining duration" ] @@ -454,7 +475,7 @@ "source": [ "print(f\"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training\")\n", "print(f\"By default, you'll train for {n_epochs} epochs on this dataset\")\n", - "print(f\"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens\")\n", + "print(f\"By default, ~{n_epochs * n_billing_tokens_in_dataset} tokens will be used in training\")\n", "plot_hist(pd.Series(batch_tokens['ntokens']))" ] }, @@ -484,7 +505,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "6699f47f-9b53-47da-95c0-b862c5826d0a", "showTitle": false, @@ -499,7 +523,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "dd37fdce-62d0-493e-bfa9-d823634b2a0d", "showTitle": false, @@ -562,7 +589,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "c21e7d1b-db34-4e5d-b6d9-190dc75170d3", "showTitle": false, @@ -585,7 +615,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": 
{}, "nuid": "b29a4a37-c2a0-4a18-8dcb-d9d29d68d683", "showTitle": false, @@ -627,7 +660,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "830ad419-e844-4ae0-8348-167ea4b66f6b", "showTitle": false, @@ -672,7 +708,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "3fbc7944-9b41-49d3-98d6-6eb91425d1ba", "showTitle": false, @@ -728,7 +767,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "fb27026e-5f1e-453f-983d-8909f8999892", "showTitle": false, @@ -743,7 +785,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "ef494943-791e-44c1-87f3-92e022eb480a", "showTitle": false, @@ -819,7 +864,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "298eb990-9160-4e1b-958f-33dd2c11b54b", "showTitle": false, @@ -827,7 +875,7 @@ } }, "source": [ - "#### Cost Estimation" + "#### Token Estimation" ] }, { @@ -854,7 +902,7 @@ "n_billing_tokens_in_dataset = len(mds_dataset) * FT_API_args.context_length \n", "print(f\"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training\")\n", "print(f\"By default, you'll train for {n_epochs} epochs on this dataset\")\n", - "print(f\"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens\")" + "print(f\"By default, ~{n_epochs * n_billing_tokens_in_dataset} tokens will be used in training\")" ] }, {