From 774c62ebcf07027e2305f540db6e919d0d0ce121 Mon Sep 17 00:00:00 2001 From: Anna Pfohl Date: Wed, 15 Nov 2023 18:00:54 -0800 Subject: [PATCH 01/11] Add eval loader to eval script --- llmfoundry/utils/builders.py | 25 +++++++++++++++++++++++++ scripts/eval/eval.py | 12 +++++++++++- scripts/train/train.py | 17 +++-------------- 3 files changed, 39 insertions(+), 15 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index dedf6f5434..d0f12f2eeb 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -31,6 +31,7 @@ HuggingFaceCheckpointer, LayerFreezing, MonolithicCheckpointSaver, ScheduledGarbageCollector) +from llmfoundry.data.dataloader import build_dataloader from llmfoundry.optim import (DecoupledAdaLRLion, DecoupledClipLion, DecoupledLionW, DecoupledLionW_8bit) from llmfoundry.optim.scheduler import InverseSquareRootWithWarmupScheduler @@ -39,6 +40,30 @@ log = logging.getLogger(__name__) +def build_eval_loader( + eval_loader_config: DictConfig, + tokenizer: PreTrainedTokenizerBase, + device_eval_batch_size: int, +) -> Evaluator: + evaluators = [] + + is_multi_eval = isinstance(eval_loader_config, ListConfig) + eval_configs = eval_loader_config if is_multi_eval else [eval_loader_config] + for eval_config in eval_configs: + eval_dataloader = build_dataloader(eval_config, tokenizer, + device_eval_batch_size) + + # For training, metrics are added after the model is created + # For eval, we'll use Evaluator's default, which is to use what's + # returned by model.get_metrics() + eval_loader = Evaluator( + label=f'eval/{eval_config.label}' if is_multi_eval else 'eval', + dataloader=eval_dataloader, + ) + evaluators.append(eval_loader) + return evaluators + + def build_icl_data_and_gauntlet( icl_tasks_config: Union[str, ListConfig], eval_gauntlet_config: Optional[Union[str, DictConfig]], diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 02a5d1f862..7f27e05329 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -21,7 +21,8 @@ from llmfoundry.models import MPTForCausalLM from llmfoundry.models.model_registry import COMPOSER_MODEL_REGISTRY -from llmfoundry.utils.builders import (build_icl_data_and_gauntlet, +from llmfoundry.utils.builders import (build_eval_loader, + build_icl_data_and_gauntlet, build_logger, build_tokenizer) from llmfoundry.utils.config_utils import pop_config, process_init_device @@ -100,6 +101,7 @@ def evaluate_model( max_seq_len: int, device_eval_batch_size: int, eval_gauntlet_config: Optional[Union[str, DictConfig]], + eval_loader_config: Optional[Union[DictConfig, ListConfig]], fsdp_config: Optional[Dict], num_retries: int, loggers_cfg: Dict[str, Any], @@ -122,6 +124,11 @@ def evaluate_model( icl_tasks, eval_gauntlet_config, tokenizer, device_eval_batch_size, max_seq_len, icl_subset_num_batches) + if eval_loader_config is not None: + loader_evaluators = build_eval_loader(eval_loader_config, tokenizer, + device_eval_batch_size) + evaluators.extend(loader_evaluators) + callbacks = [] if eval_gauntlet_callback is not None: callbacks.append(eval_gauntlet_callback) @@ -228,6 +235,8 @@ def main(cfg: DictConfig): default_value='debug') # Optional Evaluation Parameters with default values + eval_loader_config: Optional[Union[DictConfig, ListConfig]] = pop_config( + cfg, 'eval_loader', must_exist=False, default_value=None) seed: int = pop_config(cfg, 'seed', must_exist=False, default_value=17) dist_timeout: Union[float, int] = pop_config(cfg, 'dist_timeout', @@ -285,6 +294,7 @@ def main(cfg: 
DictConfig): max_seq_len=max_seq_len, device_eval_batch_size=device_eval_batch_size, eval_gauntlet_config=eval_gauntlet_config, + eval_loader_config=eval_loader_config, fsdp_config=fsdp_config, num_retries=num_retries, loggers_cfg=loggers_cfg, diff --git a/scripts/train/train.py b/scripts/train/train.py index 88f776375f..e9a91b4536 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -27,6 +27,7 @@ MPTForCausalLM) from llmfoundry.data.dataloader import build_dataloader from llmfoundry.utils.builders import (build_algorithm, build_callback, + build_eval_loader, build_icl_data_and_gauntlet, build_logger, build_optimizer, build_scheduler, build_tokenizer) @@ -529,22 +530,10 @@ def main(cfg: DictConfig) -> Trainer: evaluators = [] eval_loaders = [] if eval_loader_config is not None: - is_multi_eval = isinstance(eval_loader_config, ListConfig) - eval_configs = eval_loader_config if is_multi_eval else [ - eval_loader_config - ] - for eval_config in eval_configs: - eval_dataloader = build_dataloader(eval_config, tokenizer, - device_eval_batch_size) - eval_loader = Evaluator( - label=f'eval/{eval_config.label}' if is_multi_eval else 'eval', - dataloader=eval_dataloader, - metric_names=[], # we will add these after model is created - ) - eval_loaders.append(eval_loader) + eval_loaders = build_eval_loader(eval_loader_config, tokenizer, + device_eval_batch_size) eval_gauntlet_callback = None - if icl_tasks_config is not None: icl_evaluators, _, eval_gauntlet_callback = build_icl_data_and_gauntlet( icl_tasks_config, eval_gauntlet_config, tokenizer, From 6c7af3126dd5fbd9811ad50230e17d340b195600 Mon Sep 17 00:00:00 2001 From: Anna Pfohl Date: Thu, 16 Nov 2023 09:43:47 -0800 Subject: [PATCH 02/11] small input tests --- llmfoundry/data/dataloader.py | 4 +++- tests/test_eval_inputs.py | 1 + tests/test_train_inputs.py | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/llmfoundry/data/dataloader.py b/llmfoundry/data/dataloader.py index 12741717be..35806ca022 100644 --- a/llmfoundry/data/dataloader.py +++ b/llmfoundry/data/dataloader.py @@ -41,4 +41,6 @@ def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, device_batch_size, ) else: - raise ValueError(f'Not sure how to build dataloader with config: {cfg}') + raise ValueError( + 'Expected dataloader name "text", "text_denoising", or "finetuning"' + f' but found name "{cfg.name}" in config: {cfg}') diff --git a/tests/test_eval_inputs.py b/tests/test_eval_inputs.py index 9c7a130a9b..83104b62b7 100644 --- a/tests/test_eval_inputs.py +++ b/tests/test_eval_inputs.py @@ -57,6 +57,7 @@ def test_optional_mispelled_params_raise_warning(self, 'loggers', 'eval_gauntlet', 'fsdp_config', + 'eval_loader', ] old_cfg = copy.deepcopy(cfg) for param in optional_params: diff --git a/tests/test_train_inputs.py b/tests/test_train_inputs.py index bf90f48ef0..2ed1c9c239 100644 --- a/tests/test_train_inputs.py +++ b/tests/test_train_inputs.py @@ -103,7 +103,7 @@ def test_optional_mispelled_params_raise_warning(self, 'save_folder', 'fsdp_config', 'lora_config', - 'eval_loader_config', + 'eval_loader', 'icl_tasks_config', ] old_cfg = copy.deepcopy(cfg) From 13cbc13e7bced821708bc421fce9c07b15980cef Mon Sep 17 00:00:00 2001 From: Anna Pfohl Date: Mon, 27 Nov 2023 12:52:37 -0800 Subject: [PATCH 03/11] updates --- llmfoundry/data/dataloader.py | 42 ++++++++++++++------------------ llmfoundry/utils/builders.py | 18 ++++++++------ scripts/eval/eval.py | 38 ++++++++++++++++++----------- scripts/train/train.py | 46 
+++++++++++++++-------------------- tests/test_dataloader.py | 25 +++++++++++++++++++ 5 files changed, 97 insertions(+), 72 deletions(-) diff --git a/llmfoundry/data/dataloader.py b/llmfoundry/data/dataloader.py index 35806ca022..71d31b6808 100644 --- a/llmfoundry/data/dataloader.py +++ b/llmfoundry/data/dataloader.py @@ -11,9 +11,18 @@ from llmfoundry.data.finetuning.dataloader import build_finetuning_dataloader from llmfoundry.data.text_data import build_text_dataloader +LOADER_NAME_TO_FUNCTION = { + 'text': build_text_dataloader, + 'text_denoising': build_text_denoising_dataloader, + 'finetuning': build_finetuning_dataloader, +} -def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, - device_batch_size: int) -> DataSpec: + +def build_dataloader( + cfg: DictConfig, + tokenizer: PreTrainedTokenizerBase, + device_batch_size: int, +) -> DataSpec: """Builds a dataloader from a config. Args: @@ -22,25 +31,10 @@ def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, device_batch_size (int): The size of the batches (number of examples) that the dataloader will produce. """ - if cfg.name == 'text': - return build_text_dataloader( - cfg, - tokenizer, - device_batch_size, - ) - elif cfg.name == 'text_denoising': - return build_text_denoising_dataloader( - cfg, - tokenizer, - device_batch_size, - ) - elif cfg.name == 'finetuning': - return build_finetuning_dataloader( - cfg, - tokenizer, - device_batch_size, - ) - else: - raise ValueError( - 'Expected dataloader name "text", "text_denoising", or "finetuning"' - f' but found name "{cfg.name}" in config: {cfg}') + + if cfg.name not in LOADER_NAME_TO_FUNCTION: + allowed = ", ".join(LOADER_NAME_TO_FUNCTION.keys()) + raise ValueError(f'Expected dataloader name to be one of {allowed}' + + f' but found name "{cfg.name}" in config: {cfg}') + + return LOADER_NAME_TO_FUNCTION[cfg.name](cfg, tokenizer, device_batch_size) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index d0f12f2eeb..d2ea7f46c6 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -27,6 +27,7 @@ from torch.optim.optimizer import Optimizer from transformers import AutoTokenizer, PreTrainedTokenizerBase +from llmfoundry import ComposerHFCausalLM from llmfoundry.callbacks import (EvalGauntlet, FDiffMetrics, GlobalLRScaling, HuggingFaceCheckpointer, LayerFreezing, MonolithicCheckpointSaver, @@ -42,9 +43,13 @@ def build_eval_loader( eval_loader_config: DictConfig, + model: Union[Any, ComposerHFCausalLM], tokenizer: PreTrainedTokenizerBase, device_eval_batch_size: int, ) -> Evaluator: + assert model.train_metrics is not None + eval_metric_names = list(model.train_metrics.keys()) + evaluators = [] is_multi_eval = isinstance(eval_loader_config, ListConfig) @@ -52,13 +57,10 @@ def build_eval_loader( for eval_config in eval_configs: eval_dataloader = build_dataloader(eval_config, tokenizer, device_eval_batch_size) - - # For training, metrics are added after the model is created - # For eval, we'll use Evaluator's default, which is to use what's - # returned by model.get_metrics() eval_loader = Evaluator( label=f'eval/{eval_config.label}' if is_multi_eval else 'eval', dataloader=eval_dataloader, + metric_names=eval_metric_names, ) evaluators.append(eval_loader) return evaluators @@ -218,8 +220,8 @@ def build_tokenizer( signal_file_path = f'.node_{dist.get_node_rank()}_local_rank0_completed_tokenizer_setup' - if dist.is_available() and dist.is_initialized( - ) and dist.get_world_size() > 1: + if 
dist.is_available() and dist.is_initialized() and dist.get_world_size( + ) > 1: # Make sure the tokenizer files are downloaded and cached first by local rank 0 with dist.local_rank_zero_download_and_wait(signal_file_path): pass @@ -238,8 +240,8 @@ def build_tokenizer( int(1e30), ) - if dist.is_available() and dist.is_initialized( - ) and dist.get_world_size() > 1: + if dist.is_available() and dist.is_initialized() and dist.get_world_size( + ) > 1: if dist.get_local_rank() == 0: with open(signal_file_path, 'wb') as f: f.write(b'local_rank0_completed_tokenizer_setup') diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 7f27e05329..ef54e3234a 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -28,7 +28,7 @@ def load_peft_model(model_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, - num_retries: int) -> Optional[ComposerModel]: + num_retries: int) -> ComposerModel: try: from peft import PeftModel except ImportError as e: @@ -44,7 +44,8 @@ def load_peft_model(model_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, } retries = 0 - while retries < num_retries: + composer_model_wrapper = None + while retries < num_retries and composer_model_wrapper is None: try: trust_remote_code = model_cfg.get('trust_remote_code', True) use_auth_token = model_cfg.get('use_auth_token', False) @@ -59,7 +60,6 @@ def load_peft_model(model_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, composer_model_wrapper = COMPOSER_MODEL_REGISTRY[model_cfg.name]( peft_model, tokenizer) - return composer_model_wrapper except Exception as e: retries += 1 if retries >= num_retries: @@ -69,19 +69,21 @@ def load_peft_model(model_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, f'Got exception {str(e)} while loading model {model_cfg.name}. {num_retries-retries} retries remaining' ) + assert composer_model_wrapper is not None + return composer_model_wrapper + def load_model(model_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, - fsdp_config: Optional[Dict], - num_retries: int) -> Optional[ComposerModel]: + fsdp_config: Optional[Dict], num_retries: int) -> ComposerModel: init_context = process_init_device(model_cfg, fsdp_config) retries = 0 + composer_model = None with init_context: - while retries < num_retries: + while retries < num_retries and composer_model is None: try: composer_model = COMPOSER_MODEL_REGISTRY[model_cfg.name]( model_cfg, tokenizer) - return composer_model except Exception as e: retries += 1 if retries >= num_retries: @@ -91,6 +93,9 @@ def load_model(model_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, f'Got exception {str(e)} while loading model {model_cfg.name}. 
{num_retries-retries} retries remaining' ) + assert composer_model is not None + return composer_model + def evaluate_model( model_cfg: DictConfig, @@ -124,11 +129,6 @@ def evaluate_model( icl_tasks, eval_gauntlet_config, tokenizer, device_eval_batch_size, max_seq_len, icl_subset_num_batches) - if eval_loader_config is not None: - loader_evaluators = build_eval_loader(eval_loader_config, tokenizer, - device_eval_batch_size) - evaluators.extend(loader_evaluators) - callbacks = [] if eval_gauntlet_callback is not None: callbacks.append(eval_gauntlet_callback) @@ -150,6 +150,15 @@ def evaluate_model( composer_model = load_model(model_cfg.model, tokenizer, fsdp_config, num_retries) + if eval_loader_config is not None: + loader_evaluators = build_eval_loader( + eval_loader_config, + composer_model, + tokenizer, + device_eval_batch_size, + ) + evaluators.extend(loader_evaluators) + if eval_gauntlet_df is None and eval_gauntlet_callback is not None: eval_gauntlet_df = pd.DataFrame( columns=['model_name'] + @@ -325,8 +334,9 @@ def main(cfg: DictConfig): if eval_gauntlet_df is not None and eval_gauntlet_callback is not None: assert composite_scores is not None row = {'model_name': model_cfg['model_name']} - row.update( - {k.split('/')[-1]: v for k, v in composite_scores.items()}) + row.update({ + k.split('/')[-1]: v for k, v in composite_scores.items() + }) eval_gauntlet_df = pd.concat( [eval_gauntlet_df, pd.DataFrame([row])], ignore_index=True) diff --git a/scripts/train/train.py b/scripts/train/train.py index e9a91b4536..4bb0f996d0 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -11,7 +11,6 @@ import torch from composer import Trainer -from composer.core import Evaluator from composer.core.callback import Callback from composer.loggers import MosaicMLLogger from composer.loggers.mosaicml_logger import (MOSAICML_ACCESS_TOKEN_ENV_VAR, @@ -525,25 +524,6 @@ def main(cfg: DictConfig) -> Trainer: if mosaicml_logger is not None: mosaicml_logger.log_metrics({'data_validated': time.time()}) - ## Evaluation - print('Building eval loader...') - evaluators = [] - eval_loaders = [] - if eval_loader_config is not None: - eval_loaders = build_eval_loader(eval_loader_config, tokenizer, - device_eval_batch_size) - - eval_gauntlet_callback = None - if icl_tasks_config is not None: - icl_evaluators, _, eval_gauntlet_callback = build_icl_data_and_gauntlet( - icl_tasks_config, eval_gauntlet_config, tokenizer, - device_eval_batch_size, icl_seq_len if icl_seq_len else max_seq_len, - icl_subset_num_batches) - evaluators.extend(icl_evaluators) - - if eval_gauntlet_callback is not None: - callbacks.append(eval_gauntlet_callback) - # Build Model print('Initializing model...') with init_context: @@ -568,13 +548,27 @@ def main(cfg: DictConfig) -> Trainer: optimizer_name: str = optimizer_config.pop('name') optimizer = build_optimizer(model, optimizer_name, optimizer_config) - # Now add the eval metrics + ## Evaluation + print('Building eval loader...') + evaluators = [] if eval_loader_config is not None: - assert model.train_metrics is not None - eval_metric_names = list(model.train_metrics.keys()) - for eval_loader in eval_loaders: - eval_loader.metric_names = eval_metric_names - evaluators.insert(0, eval_loader) # Put the base eval_loaders first + evaluators = build_eval_loader( + eval_loader_config, + model, + tokenizer, + device_eval_batch_size, + ) + + eval_gauntlet_callback = None + if icl_tasks_config is not None: + icl_evaluators, _, eval_gauntlet_callback = build_icl_data_and_gauntlet( + 
icl_tasks_config, eval_gauntlet_config, tokenizer, + device_eval_batch_size, icl_seq_len if icl_seq_len else max_seq_len, + icl_subset_num_batches) + evaluators.extend(icl_evaluators) + + if eval_gauntlet_callback is not None: + callbacks.append(eval_gauntlet_callback) # Build the Trainer print('Building trainer...') diff --git a/tests/test_dataloader.py b/tests/test_dataloader.py index 2080ec32ec..642b6baaed 100644 --- a/tests/test_dataloader.py +++ b/tests/test_dataloader.py @@ -21,6 +21,7 @@ from llmfoundry import (build_finetuning_dataloader, build_text_denoising_dataloader) +from llmfoundry.data import build_dataloader from llmfoundry.data.text_data import (ConcatenatedSequenceCollatorWrapper, build_text_dataloader, get_tokens_per_batch_func) @@ -732,3 +733,27 @@ def test_token_counting_func_dataloader_setting( actual_token_count = dl.get_num_tokens_in_batch(batch_tokenized) assert actual_token_count == expected_token_count + + +def test_build_unknown_dataloader(): + cfg = DictConfig({ + 'name': 'unknown', + 'dataset': { + 'local': 'dummy-path', + 'remote': 'dummy-path', + 'split': 'train', + 'max_seq_len': 1024, + 'shuffle': True, + 'shuffle_seed': 0, + }, + 'drop_last': False, + 'num_workers': 0, + 'prefetch_factor': None if using_torch_2() else 2, + 'pin_memory': False, + 'persistent_workers': False, + 'timeout': 0 + }) + tokenizer = MagicMock() + with pytest.raises(ValueError, + match='Expected dataloader name to be one of'): + _ = build_dataloader(cfg, tokenizer, 2) From 204d2f7c5a729ba5955f7723751e4db5a179e056 Mon Sep 17 00:00:00 2001 From: Anna Pfohl Date: Mon, 27 Nov 2023 23:25:07 +0000 Subject: [PATCH 04/11] fix typing and formatting --- llmfoundry/data/dataloader.py | 9 +++------ llmfoundry/utils/builders.py | 24 ++++++++++++++---------- scripts/eval/eval.py | 5 ++--- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/llmfoundry/data/dataloader.py b/llmfoundry/data/dataloader.py index 71d31b6808..6974e3ba0b 100644 --- a/llmfoundry/data/dataloader.py +++ b/llmfoundry/data/dataloader.py @@ -18,11 +18,8 @@ } -def build_dataloader( - cfg: DictConfig, - tokenizer: PreTrainedTokenizerBase, - device_batch_size: int, -) -> DataSpec: +def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, + device_batch_size: int) -> DataSpec: """Builds a dataloader from a config. 
Args: @@ -33,7 +30,7 @@ def build_dataloader( """ if cfg.name not in LOADER_NAME_TO_FUNCTION: - allowed = ", ".join(LOADER_NAME_TO_FUNCTION.keys()) + allowed = ', '.join(LOADER_NAME_TO_FUNCTION.keys()) raise ValueError(f'Expected dataloader name to be one of {allowed}' + f' but found name "{cfg.name}" in config: {cfg}') diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index d2ea7f46c6..c2fbf3f71f 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -42,22 +42,26 @@ def build_eval_loader( - eval_loader_config: DictConfig, + eval_loader_config: Union[DictConfig, ListConfig], model: Union[Any, ComposerHFCausalLM], tokenizer: PreTrainedTokenizerBase, device_eval_batch_size: int, -) -> Evaluator: +) -> List[Evaluator]: assert model.train_metrics is not None eval_metric_names = list(model.train_metrics.keys()) - evaluators = [] + evaluators: List[Evaluator] = [] + if isinstance(eval_loader_config, ListConfig): + eval_configs: ListConfig = eval_configs + is_multi_eval = True + else: + eval_configs = ListConfig([eval_loader_config]) + is_multi_eval = False - is_multi_eval = isinstance(eval_loader_config, ListConfig) - eval_configs = eval_loader_config if is_multi_eval else [eval_loader_config] for eval_config in eval_configs: eval_dataloader = build_dataloader(eval_config, tokenizer, device_eval_batch_size) - eval_loader = Evaluator( + eval_loader: Evaluator = Evaluator( label=f'eval/{eval_config.label}' if is_multi_eval else 'eval', dataloader=eval_dataloader, metric_names=eval_metric_names, @@ -220,8 +224,8 @@ def build_tokenizer( signal_file_path = f'.node_{dist.get_node_rank()}_local_rank0_completed_tokenizer_setup' - if dist.is_available() and dist.is_initialized() and dist.get_world_size( - ) > 1: + if dist.is_available() and dist.is_initialized( + ) and dist.get_world_size() > 1: # Make sure the tokenizer files are downloaded and cached first by local rank 0 with dist.local_rank_zero_download_and_wait(signal_file_path): pass @@ -240,8 +244,8 @@ def build_tokenizer( int(1e30), ) - if dist.is_available() and dist.is_initialized() and dist.get_world_size( - ) > 1: + if dist.is_available() and dist.is_initialized( + ) and dist.get_world_size() > 1: if dist.get_local_rank() == 0: with open(signal_file_path, 'wb') as f: f.write(b'local_rank0_completed_tokenizer_setup') diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index ef54e3234a..1f306f4de4 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -334,9 +334,8 @@ def main(cfg: DictConfig): if eval_gauntlet_df is not None and eval_gauntlet_callback is not None: assert composite_scores is not None row = {'model_name': model_cfg['model_name']} - row.update({ - k.split('/')[-1]: v for k, v in composite_scores.items() - }) + row.update( + {k.split('/')[-1]: v for k, v in composite_scores.items()}) eval_gauntlet_df = pd.concat( [eval_gauntlet_df, pd.DataFrame([row])], ignore_index=True) From 5b852184d983ca5caf294c81ab71f22318fa2e49 Mon Sep 17 00:00:00 2001 From: Anna Pfohl Date: Mon, 27 Nov 2023 16:11:28 -0800 Subject: [PATCH 05/11] fixes, add tests --- llmfoundry/utils/builders.py | 8 +-- scripts/eval/eval.py | 4 ++ tests/data_utils.py | 95 ++++++++++++++++++++++++++++++++++++ tests/test_eval.py | 86 ++++++++++++++++++++++++++++++++ tests/test_training.py | 92 +--------------------------------- 5 files changed, 191 insertions(+), 94 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index c2fbf3f71f..2a89f94ef3 100644 --- 
a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -224,8 +224,8 @@ def build_tokenizer( signal_file_path = f'.node_{dist.get_node_rank()}_local_rank0_completed_tokenizer_setup' - if dist.is_available() and dist.is_initialized( - ) and dist.get_world_size() > 1: + if dist.is_available() and dist.is_initialized() and dist.get_world_size( + ) > 1: # Make sure the tokenizer files are downloaded and cached first by local rank 0 with dist.local_rank_zero_download_and_wait(signal_file_path): pass @@ -244,8 +244,8 @@ def build_tokenizer( int(1e30), ) - if dist.is_available() and dist.is_initialized( - ) and dist.get_world_size() > 1: + if dist.is_available() and dist.is_initialized() and dist.get_world_size( + ) > 1: if dist.get_local_rank() == 0: with open(signal_file_path, 'wb') as f: f.write(b'local_rank0_completed_tokenizer_setup') diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 1f306f4de4..49db6da122 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -292,6 +292,7 @@ def main(cfg: DictConfig): eval_gauntlet_df = None models_df = None composite_scores = None + trainers = [] for model_cfg in model_configs: (trainer, logger_keys, eval_gauntlet_callback, eval_gauntlet_df) = evaluate_model( @@ -311,6 +312,7 @@ def main(cfg: DictConfig): precision=precision, eval_gauntlet_df=eval_gauntlet_df, icl_subset_num_batches=icl_subset_num_batches) + trainers.append(trainer) if eval_gauntlet_callback is not None: composite_scores = eval_gauntlet_callback.eval_after_all( @@ -349,6 +351,8 @@ def main(cfg: DictConfig): assert models_df is not None print(models_df.to_markdown(index=False)) + return trainers, eval_gauntlet_df + def calculate_markdown_results(logger_keys: List[str], trainer: Trainer, benchmark_to_taxonomy: Dict[str, str], diff --git a/tests/data_utils.py b/tests/data_utils.py index 075933de7d..0f3a80c299 100644 --- a/tests/data_utils.py +++ b/tests/data_utils.py @@ -3,8 +3,23 @@ import json import os +import pathlib +import shutil +import sys +from argparse import Namespace from typing import Optional +from omegaconf import DictConfig +from omegaconf import OmegaConf as om + +from scripts.data_prep.convert_dataset_hf import main as main_hf # noqa: E402 +from scripts.data_prep.convert_dataset_json import \ + main as main_json # noqa: E402 + +# Add repo root to path so we can import scripts and test it +repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) +sys.path.append(repo_dir) + def make_tiny_ft_dataset( path: str, @@ -65,3 +80,83 @@ def make_tiny_ft_dataset( for sample in samples: _f.write(json.dumps(sample)) _f.write('\n') + + +def create_c4_dataset_xsmall(path: pathlib.Path) -> str: + """Creates a small mocked version of the C4 dataset.""" + c4_dir = os.path.join(path, f'my-copy-c4') + downloaded_split = 'val_xsmall' # very fast to convert + + # Hyperparameters from https://github.com/mosaicml/llm-foundry/blob/340a56658560ebceb2a3aa69d6e37813e415acd0/README.md#L188 + main_hf( + Namespace( + **{ + 'dataset': 'c4', + 'data_subset': 'en', + 'splits': [downloaded_split], + 'out_root': c4_dir, + 'compression': None, + 'concat_tokens': 2048, + 'tokenizer': 'EleutherAI/gpt-neox-20b', + 'tokenizer_kwargs': {}, + 'bos_text': '', + 'eos_text': '<|endoftext|>', + 'no_wrap': False, + 'num_workers': 8 + })) + + # copy the small downloaded_split to other c4 splits for mocking purposes + mocked_splits = ['train', 'val'] + for mocked_split in mocked_splits: + shutil.copytree(os.path.join(c4_dir, 'val_xsmall'), + os.path.join(c4_dir, 
mocked_split)) + assert os.path.exists(c4_dir) + return c4_dir + + +def create_arxiv_dataset(path: pathlib.Path) -> str: + """Creates an arxiv dataset.""" + arxiv_dir = os.path.join(path, f'my-copy-arxiv') + downloaded_split = 'train' + + main_json( + Namespace( + **{ + 'path': 'data_prep/example_data/arxiv.jsonl', + 'out_root': arxiv_dir, + 'compression': None, + 'split': downloaded_split, + 'concat_tokens': None, + 'bos_text': None, + 'eos_text': None, + 'no_wrap': False, + 'num_workers': None + })) + + return arxiv_dir + + +def gpt_tiny_cfg(dataset_name: str, device: str): + """Create gpt tiny cfg.""" + conf_path: str = os.path.join(repo_dir, + 'scripts/train/yamls/pretrain/testing.yaml') + with open(conf_path) as f: + test_cfg = om.load(f) + assert isinstance(test_cfg, DictConfig) + + test_cfg.data_local = dataset_name + test_cfg.global_train_batch_size = 8 + test_cfg.device_eval_batch_size = 4 + test_cfg.device_train_microbatch_size = 4 + test_cfg.max_duration = '4ba' + test_cfg.eval_interval = '4ba' + test_cfg.run_name = 'gpt-mini-integration-test' + + if device == 'cpu': + test_cfg.model.init_device = 'cpu' + test_cfg.fsdp_config = None + test_cfg.model.attn_config.attn_impl = 'torch' + test_cfg.model.loss_fn = 'torch_crossentropy' + test_cfg.precision = 'fp32' + + return test_cfg diff --git a/tests/test_eval.py b/tests/test_eval.py index 1217487b70..5fcb8588e3 100644 --- a/tests/test_eval.py +++ b/tests/test_eval.py @@ -1,16 +1,21 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 +import copy import os +import pathlib import sys from typing import Any import omegaconf as om import pytest from composer import Trainer +from composer.loggers import InMemoryLogger from llmfoundry import COMPOSER_MODEL_REGISTRY from llmfoundry.utils import build_tokenizer +from tests.data_utils import (create_arxiv_dataset, create_c4_dataset_xsmall, + gpt_tiny_cfg) # Add repo root to path so we can import scripts and test it repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) @@ -66,3 +71,84 @@ def test_icl_eval(capfd: Any, mock_saved_model_path: Any): assert expected_results in out expected_results = '| model_name | default_average | language_understanding_lite |\n|:-------------|------------------:|------------------------------:|\n| tiny_mpt | 0 | 0 |' assert expected_results in out + + +@pytest.mark.gpu +def test_loader_eval(capfd: Any, mock_saved_model_path: Any, + tmp_path: pathlib.Path): + + c4_dataset_name = create_c4_dataset_xsmall(tmp_path) + + # Use a training config that already has eval loader configured + test_cfg = gpt_tiny_cfg(c4_dataset_name, 'cpu') + + # define icl eval task + test_cfg.icl_tasks = om.ListConfig([ + om.DictConfig({ + 'label': + 'lambada_openai', + 'dataset_uri': + 'eval/local_data/language_understanding/lambada_openai_small.jsonl', + 'num_fewshot': [0], + 'icl_task_type': + 'language_modeling' + }) + ]) + + # convert the model from a training to eval model + model = test_cfg.pop('model') + new_model = { + 'model_name': model.get('name'), + 'model': model, + 'load_path': mock_saved_model_path + } + + tokenizer = test_cfg.pop('tokenizer', None) + if tokenizer: + new_model['tokenizer'] = tokenizer + test_cfg.models = [new_model] + + # Set up multiple eval dataloaders + first_eval_loader = test_cfg.eval_loader + first_eval_loader.label = 'c4' + # Create second eval dataloader using the arxiv dataset. 
+ second_eval_loader = copy.deepcopy(first_eval_loader) + arxiv_dataset_name = create_arxiv_dataset(tmp_path) + second_eval_loader.data_local = arxiv_dataset_name + second_eval_loader.label = 'arxiv' + test_cfg.eval_loader = om.OmegaConf.create( + [first_eval_loader, second_eval_loader]) + + trainers, eval_gauntlet_df = main(test_cfg) + assert eval_gauntlet_df is None + + assert len(trainers) == 1 # one per model + trainer = trainers[0] + + assert isinstance(trainer.logger.destinations, tuple) + + assert len(trainer.logger.destinations) > 0 + inmemorylogger = trainer.logger.destinations[ + 0] # pyright: ignore [reportGeneralTypeIssues] + assert isinstance(inmemorylogger, InMemoryLogger) + print(inmemorylogger.data.keys()) + + # Checks for first eval dataloader + assert 'metrics/eval/c4/LanguageCrossEntropy' in inmemorylogger.data.keys() + assert isinstance( + inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'], list) + assert len( + inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1]) > 0 + assert isinstance( + inmemorylogger.data['metrics/eval/c4/LanguageCrossEntropy'][-1], tuple) + + # Checks for second eval dataloader + assert 'metrics/eval/arxiv/LanguageCrossEntropy' in inmemorylogger.data.keys( + ) + assert isinstance( + inmemorylogger.data['metrics/eval/arxiv/LanguageCrossEntropy'], list) + assert len( + inmemorylogger.data['metrics/eval/arxiv/LanguageCrossEntropy'][-1]) > 0 + assert isinstance( + inmemorylogger.data['metrics/eval/arxiv/LanguageCrossEntropy'][-1], + tuple) diff --git a/tests/test_training.py b/tests/test_training.py index 214909cc28..596c62a785 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -3,9 +3,6 @@ import copy import os import pathlib -import shutil -import sys -from argparse import Namespace from typing import Any, Optional import pytest @@ -13,94 +10,9 @@ from omegaconf import DictConfig, ListConfig from omegaconf import OmegaConf as om -# Add repo root to path so we can import scripts and test it -repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) -sys.path.append(repo_dir) - -from scripts.data_prep.convert_dataset_hf import main as main_hf # noqa: E402 -from scripts.data_prep.convert_dataset_json import \ - main as main_json # noqa: E402 from scripts.train.train import main # noqa: E402 - - -def create_c4_dataset_xsmall(path: pathlib.Path) -> str: - """Creates a small mocked version of the C4 dataset.""" - c4_dir = os.path.join(path, f'my-copy-c4') - downloaded_split = 'val_xsmall' # very fast to convert - - # Hyperparameters from https://github.com/mosaicml/llm-foundry/blob/340a56658560ebceb2a3aa69d6e37813e415acd0/README.md#L188 - main_hf( - Namespace( - **{ - 'dataset': 'c4', - 'data_subset': 'en', - 'splits': [downloaded_split], - 'out_root': c4_dir, - 'compression': None, - 'concat_tokens': 2048, - 'tokenizer': 'EleutherAI/gpt-neox-20b', - 'tokenizer_kwargs': {}, - 'bos_text': '', - 'eos_text': '<|endoftext|>', - 'no_wrap': False, - 'num_workers': 8 - })) - - # copy the small downloaded_split to other c4 splits for mocking purposes - mocked_splits = ['train', 'val'] - for mocked_split in mocked_splits: - shutil.copytree(os.path.join(c4_dir, 'val_xsmall'), - os.path.join(c4_dir, mocked_split)) - assert os.path.exists(c4_dir) - return c4_dir - - -def create_arxiv_dataset(path: pathlib.Path) -> str: - """Creates an arxiv dataset.""" - arxiv_dir = os.path.join(path, f'my-copy-arxiv') - downloaded_split = 'train' - - main_json( - Namespace( - **{ - 'path': 'data_prep/example_data/arxiv.jsonl', - 
'out_root': arxiv_dir, - 'compression': None, - 'split': downloaded_split, - 'concat_tokens': None, - 'bos_text': None, - 'eos_text': None, - 'no_wrap': False, - 'num_workers': None - })) - - return arxiv_dir - - -def gpt_tiny_cfg(dataset_name: str, device: str): - """Create gpt tiny cfg.""" - conf_path: str = os.path.join(repo_dir, - 'scripts/train/yamls/pretrain/testing.yaml') - with open(conf_path) as f: - test_cfg = om.load(f) - assert isinstance(test_cfg, DictConfig) - - test_cfg.data_local = dataset_name - test_cfg.global_train_batch_size = 8 - test_cfg.device_eval_batch_size = 4 - test_cfg.device_train_microbatch_size = 4 - test_cfg.max_duration = '4ba' - test_cfg.eval_interval = '4ba' - test_cfg.run_name = 'gpt-mini-integration-test' - - if device == 'cpu': - test_cfg.model.init_device = 'cpu' - test_cfg.fsdp_config = None - test_cfg.model.attn_config.attn_impl = 'torch' - test_cfg.model.loss_fn = 'torch_crossentropy' - test_cfg.precision = 'fp32' - - return test_cfg +from tests.data_utils import (create_arxiv_dataset, create_c4_dataset_xsmall, + gpt_tiny_cfg) @pytest.fixture(autouse=False) From 972ed965ad44ee583021fddc4e018ad2b0feb6f8 Mon Sep 17 00:00:00 2001 From: Anna Pfohl Date: Mon, 27 Nov 2023 16:36:44 -0800 Subject: [PATCH 06/11] remove circular dependency --- llmfoundry/utils/builders.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 793ae8c99f..21913a6cdd 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -27,7 +27,6 @@ from torch.optim.optimizer import Optimizer from transformers import AutoTokenizer, PreTrainedTokenizerBase -from llmfoundry import ComposerHFCausalLM from llmfoundry.callbacks import (EvalGauntlet, FDiffMetrics, GlobalLRScaling, HuggingFaceCheckpointer, LayerFreezing, MonolithicCheckpointSaver, @@ -43,7 +42,7 @@ def build_eval_loader( eval_loader_config: Union[DictConfig, ListConfig], - model: Union[Any, ComposerHFCausalLM], + model: Any, tokenizer: PreTrainedTokenizerBase, device_eval_batch_size: int, ) -> List[Evaluator]: From 98405895073c77e35e726be66c5dd4167ac55cab Mon Sep 17 00:00:00 2001 From: Anna Pfohl Date: Tue, 28 Nov 2023 01:07:06 +0000 Subject: [PATCH 07/11] tests pass --- llmfoundry/data/dataloader.py | 1 - llmfoundry/utils/builders.py | 10 +++++----- tests/test_eval.py | 6 +++++- tests/test_training.py | 1 + 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/llmfoundry/data/dataloader.py b/llmfoundry/data/dataloader.py index 6974e3ba0b..63d47a65d5 100644 --- a/llmfoundry/data/dataloader.py +++ b/llmfoundry/data/dataloader.py @@ -28,7 +28,6 @@ def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, device_batch_size (int): The size of the batches (number of examples) that the dataloader will produce. 
""" - if cfg.name not in LOADER_NAME_TO_FUNCTION: allowed = ', '.join(LOADER_NAME_TO_FUNCTION.keys()) raise ValueError(f'Expected dataloader name to be one of {allowed}' + diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 21913a6cdd..8d796acfdf 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -51,7 +51,7 @@ def build_eval_loader( evaluators: List[Evaluator] = [] if isinstance(eval_loader_config, ListConfig): - eval_configs: ListConfig = eval_configs + eval_configs: ListConfig = eval_loader_config is_multi_eval = True else: eval_configs = ListConfig([eval_loader_config]) @@ -223,8 +223,8 @@ def build_tokenizer( signal_file_path = f'.node_{dist.get_node_rank()}_local_rank0_completed_tokenizer_setup' - if dist.is_available() and dist.is_initialized() and dist.get_world_size( - ) > 1: + if dist.is_available() and dist.is_initialized( + ) and dist.get_world_size() > 1: # Make sure the tokenizer files are downloaded and cached first by local rank 0 with dist.local_rank_zero_download_and_wait(signal_file_path): pass @@ -243,8 +243,8 @@ def build_tokenizer( int(1e30), ) - if dist.is_available() and dist.is_initialized() and dist.get_world_size( - ) > 1: + if dist.is_available() and dist.is_initialized( + ) and dist.get_world_size() > 1: if dist.get_local_rank() == 0: with open(signal_file_path, 'wb') as f: f.write(b'local_rank0_completed_tokenizer_setup') diff --git a/tests/test_eval.py b/tests/test_eval.py index 5fcb8588e3..a9c738e651 100644 --- a/tests/test_eval.py +++ b/tests/test_eval.py @@ -119,9 +119,13 @@ def test_loader_eval(capfd: Any, mock_saved_model_path: Any, test_cfg.eval_loader = om.OmegaConf.create( [first_eval_loader, second_eval_loader]) + test_cfg.max_duration = '1ba' + test_cfg.eval_interval = '1ba' + test_cfg.loggers = om.DictConfig({'inmemory': om.DictConfig({})}) + trainers, eval_gauntlet_df = main(test_cfg) - assert eval_gauntlet_df is None + assert eval_gauntlet_df is None assert len(trainers) == 1 # one per model trainer = trainers[0] diff --git a/tests/test_training.py b/tests/test_training.py index 032f81a4e3..def78d62d0 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -15,6 +15,7 @@ from tests.data_utils import (create_arxiv_dataset, create_c4_dataset_xsmall, gpt_tiny_cfg) + @pytest.fixture(autouse=False) def set_correct_cwd(): if not os.getcwd().endswith('llm-foundry/scripts'): From bc1c7c7803a1b91bb43a02a06b4085acf360596f Mon Sep 17 00:00:00 2001 From: Anna Pfohl Date: Wed, 29 Nov 2023 23:42:12 +0000 Subject: [PATCH 08/11] nits + small fixes --- llmfoundry/utils/builders.py | 8 ++++++-- scripts/eval/eval.py | 20 ++++++++++---------- scripts/train/train.py | 4 ++-- tests/data_utils.py | 19 ++++++++++--------- tests/test_dataloader.py | 14 -------------- tests/test_eval.py | 13 ++++++------- tests/test_training.py | 6 +++--- 7 files changed, 37 insertions(+), 47 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 8d796acfdf..564756902f 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -40,13 +40,17 @@ log = logging.getLogger(__name__) -def build_eval_loader( +def build_eval_loaders( eval_loader_config: Union[DictConfig, ListConfig], model: Any, tokenizer: PreTrainedTokenizerBase, device_eval_batch_size: int, ) -> List[Evaluator]: - assert model.train_metrics is not None + if model.train_metrics is not None: + raise ValueError( + 'Eval loader requires metrics, either through the models defaults and/or train_metrics' + ) + 
eval_metric_names = list(model.train_metrics.keys()) evaluators: List[Evaluator] = [] diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 49db6da122..43bd7e7a96 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -6,7 +6,7 @@ import sys import time import warnings -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Tuple, Union import pandas as pd import torch @@ -21,7 +21,7 @@ from llmfoundry.models import MPTForCausalLM from llmfoundry.models.model_registry import COMPOSER_MODEL_REGISTRY -from llmfoundry.utils.builders import (build_eval_loader, +from llmfoundry.utils.builders import (build_eval_loaders, build_icl_data_and_gauntlet, build_logger, build_tokenizer) from llmfoundry.utils.config_utils import pop_config, process_init_device @@ -151,13 +151,13 @@ def evaluate_model( num_retries) if eval_loader_config is not None: - loader_evaluators = build_eval_loader( - eval_loader_config, - composer_model, - tokenizer, - device_eval_batch_size, - ) - evaluators.extend(loader_evaluators) + evaluators.extend( + build_eval_loaders( + eval_loader_config, + composer_model, + tokenizer, + device_eval_batch_size, + )) if eval_gauntlet_df is None and eval_gauntlet_callback is not None: eval_gauntlet_df = pd.DataFrame( @@ -202,7 +202,7 @@ def evaluate_model( return (trainer, logger_keys, eval_gauntlet_callback, eval_gauntlet_df) -def main(cfg: DictConfig): +def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: om.resolve(cfg) model_configs: ListConfig = pop_config(cfg, 'models', must_exist=True) eval_gauntlet_config: Optional[Union[str, DictConfig]] = pop_config( diff --git a/scripts/train/train.py b/scripts/train/train.py index 4bb0f996d0..34ebbf9219 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -26,7 +26,7 @@ MPTForCausalLM) from llmfoundry.data.dataloader import build_dataloader from llmfoundry.utils.builders import (build_algorithm, build_callback, - build_eval_loader, + build_eval_loaders, build_icl_data_and_gauntlet, build_logger, build_optimizer, build_scheduler, build_tokenizer) @@ -552,7 +552,7 @@ def main(cfg: DictConfig) -> Trainer: print('Building eval loader...') evaluators = [] if eval_loader_config is not None: - evaluators = build_eval_loader( + evaluators = build_eval_loaders( eval_loader_config, model, tokenizer, diff --git a/tests/data_utils.py b/tests/data_utils.py index 0f3a80c299..efb4f6d7cf 100644 --- a/tests/data_utils.py +++ b/tests/data_utils.py @@ -1,11 +1,16 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -import json import os +import sys + +# Add repo root to path so we can import scripts and test it +repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) +sys.path.append(repo_dir) + +import json import pathlib import shutil -import sys from argparse import Namespace from typing import Optional @@ -16,10 +21,6 @@ from scripts.data_prep.convert_dataset_json import \ main as main_json # noqa: E402 -# Add repo root to path so we can import scripts and test it -repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) -sys.path.append(repo_dir) - def make_tiny_ft_dataset( path: str, @@ -82,10 +83,10 @@ def make_tiny_ft_dataset( _f.write('\n') -def create_c4_dataset_xsmall(path: pathlib.Path) -> str: +def create_c4_dataset_xxsmall(path: pathlib.Path) -> str: """Creates a small mocked version of the C4 dataset.""" c4_dir = os.path.join(path, f'my-copy-c4') - downloaded_split = 'val_xsmall' # very fast 
to convert + downloaded_split = 'val_xxsmall' # very fast to convert # Hyperparameters from https://github.com/mosaicml/llm-foundry/blob/340a56658560ebceb2a3aa69d6e37813e415acd0/README.md#L188 main_hf( @@ -108,7 +109,7 @@ def create_c4_dataset_xsmall(path: pathlib.Path) -> str: # copy the small downloaded_split to other c4 splits for mocking purposes mocked_splits = ['train', 'val'] for mocked_split in mocked_splits: - shutil.copytree(os.path.join(c4_dir, 'val_xsmall'), + shutil.copytree(os.path.join(c4_dir, 'val_xxsmall'), os.path.join(c4_dir, mocked_split)) assert os.path.exists(c4_dir) return c4_dir diff --git a/tests/test_dataloader.py b/tests/test_dataloader.py index f0215d6965..2e9039644b 100644 --- a/tests/test_dataloader.py +++ b/tests/test_dataloader.py @@ -746,20 +746,6 @@ def test_token_counting_func_dataloader_setting( def test_build_unknown_dataloader(): cfg = DictConfig({ 'name': 'unknown', - 'dataset': { - 'local': 'dummy-path', - 'remote': 'dummy-path', - 'split': 'train', - 'max_seq_len': 1024, - 'shuffle': True, - 'shuffle_seed': 0, - }, - 'drop_last': False, - 'num_workers': 0, - 'prefetch_factor': None if using_torch_2() else 2, - 'pin_memory': False, - 'persistent_workers': False, - 'timeout': 0 }) tokenizer = MagicMock() with pytest.raises(ValueError, diff --git a/tests/test_eval.py b/tests/test_eval.py index a9c738e651..2fc96bb7ad 100644 --- a/tests/test_eval.py +++ b/tests/test_eval.py @@ -14,7 +14,7 @@ from llmfoundry import COMPOSER_MODEL_REGISTRY from llmfoundry.utils import build_tokenizer -from tests.data_utils import (create_arxiv_dataset, create_c4_dataset_xsmall, +from tests.data_utils import (create_arxiv_dataset, create_c4_dataset_xxsmall, gpt_tiny_cfg) # Add repo root to path so we can import scripts and test it @@ -77,7 +77,7 @@ def test_icl_eval(capfd: Any, mock_saved_model_path: Any): def test_loader_eval(capfd: Any, mock_saved_model_path: Any, tmp_path: pathlib.Path): - c4_dataset_name = create_c4_dataset_xsmall(tmp_path) + c4_dataset_name = create_c4_dataset_xxsmall(tmp_path) # Use a training config that already has eval loader configured test_cfg = gpt_tiny_cfg(c4_dataset_name, 'cpu') @@ -97,16 +97,15 @@ def test_loader_eval(capfd: Any, mock_saved_model_path: Any, # convert the model from a training to eval model model = test_cfg.pop('model') - new_model = { + eval_model = { 'model_name': model.get('name'), 'model': model, 'load_path': mock_saved_model_path } - tokenizer = test_cfg.pop('tokenizer', None) - if tokenizer: - new_model['tokenizer'] = tokenizer - test_cfg.models = [new_model] + tokenizer = test_cfg.pop('tokenizer') + eval_model['tokenizer'] = tokenizer + test_cfg.models = [eval_model] # Set up multiple eval dataloaders first_eval_loader = test_cfg.eval_loader diff --git a/tests/test_training.py b/tests/test_training.py index def78d62d0..3cd2963100 100644 --- a/tests/test_training.py +++ b/tests/test_training.py @@ -12,7 +12,7 @@ from omegaconf import OmegaConf as om from scripts.train.train import main # noqa: E402 -from tests.data_utils import (create_arxiv_dataset, create_c4_dataset_xsmall, +from tests.data_utils import (create_arxiv_dataset, create_c4_dataset_xxsmall, gpt_tiny_cfg) @@ -33,7 +33,7 @@ def set_correct_cwd(): def test_train_gauntlet(averages: Optional[dict], set_correct_cwd: Any, tmp_path: pathlib.Path): """Test training run with a small dataset.""" - dataset_name = create_c4_dataset_xsmall(tmp_path) + dataset_name = create_c4_dataset_xxsmall(tmp_path) test_cfg = gpt_tiny_cfg(dataset_name, 'cpu') test_cfg.icl_tasks = 
ListConfig([ DictConfig({ @@ -112,7 +112,7 @@ def test_train_gauntlet(averages: Optional[dict], set_correct_cwd: Any, def test_train_multi_eval(set_correct_cwd: Any, tmp_path: pathlib.Path): """Test training run with multiple eval datasets.""" - c4_dataset_name = create_c4_dataset_xsmall(tmp_path) + c4_dataset_name = create_c4_dataset_xxsmall(tmp_path) test_cfg = gpt_tiny_cfg(c4_dataset_name, 'cpu') # Set up multiple eval dataloaders first_eval_loader = test_cfg.eval_loader From 1050deff296484c93c9a2416eee96862e6394951 Mon Sep 17 00:00:00 2001 From: Anna Pfohl Date: Thu, 30 Nov 2023 00:39:37 +0000 Subject: [PATCH 09/11] add metrics at the end, refactor to put icl/gauntlet as helpers --- llmfoundry/utils/builders.py | 69 +++++++++++++++++++++++++++++++----- scripts/eval/eval.py | 27 +++++++------- scripts/train/train.py | 48 ++++++++++++------------- 3 files changed, 97 insertions(+), 47 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 564756902f..4f6e8644a6 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -40,19 +40,46 @@ log = logging.getLogger(__name__) -def build_eval_loaders( - eval_loader_config: Union[DictConfig, ListConfig], - model: Any, +def build_evaluators( + eval_loader_config: Optional[Union[DictConfig, ListConfig]], + icl_tasks_config: Optional[Union[str, ListConfig]], + eval_gauntlet_config: Optional[Union[str, DictConfig]], + *, tokenizer: PreTrainedTokenizerBase, device_eval_batch_size: int, -) -> List[Evaluator]: - if model.train_metrics is not None: - raise ValueError( - 'Eval loader requires metrics, either through the models defaults and/or train_metrics' + icl_seq_len: int, + icl_subset_num_batches: Optional[int], +) -> Tuple[List[Evaluator], List[str], Optional[EvalGauntlet]]: + + evaluators = [] + if eval_loader_config is not None: + evaluators = build_eval_loaders( + eval_loader_config, + tokenizer, + device_eval_batch_size, ) - eval_metric_names = list(model.train_metrics.keys()) + logger_keys = [] + eval_gauntlet_callback = None + if icl_tasks_config is not None: + icl_evaluators, logger_keys, eval_gauntlet_callback = build_icl_data_and_gauntlet( + icl_tasks_config, + eval_gauntlet_config, + tokenizer, + device_eval_batch_size, + icl_seq_len, + icl_subset_num_batches, + ) + evaluators.extend(icl_evaluators) + + return evaluators, logger_keys, eval_gauntlet_callback + +def build_eval_loaders( + eval_loader_config: Union[DictConfig, ListConfig], + tokenizer: PreTrainedTokenizerBase, + device_eval_batch_size: int, +) -> List[Evaluator]: evaluators: List[Evaluator] = [] if isinstance(eval_loader_config, ListConfig): eval_configs: ListConfig = eval_loader_config @@ -67,12 +94,36 @@ def build_eval_loaders( eval_loader: Evaluator = Evaluator( label=f'eval/{eval_config.label}' if is_multi_eval else 'eval', dataloader=eval_dataloader, - metric_names=eval_metric_names, + # Load the eval data to fail fast. 
metrics will get added + # later in add_metrics_to_eval_loaders, after the model is loaded + metric_names=[], ) evaluators.append(eval_loader) return evaluators +def add_metrics_to_eval_loaders( + evaluators: List[Evaluator], + model: Any, +) -> List[Evaluator]: + if model.train_metrics is not None: + raise ValueError( + 'Eval loader requires metrics, either through the models defaults and/or train_metrics' + ) + + eval_metric_names = list(model.train_metrics.keys()) + eval_loaders, other_evaluators = [], [] + for evaluator in evaluators: + if evaluator.metric_names == []: + evaluator.metric_names = eval_metric_names + eval_loaders.append(evaluator) + else: + other_evaluators.append(evaluator) + + # Put the base eval_loaders first + return eval_loaders + other_evaluators + + def build_icl_data_and_gauntlet( icl_tasks_config: Union[str, ListConfig], eval_gauntlet_config: Optional[Union[str, DictConfig]], diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 43bd7e7a96..b59de6ac5e 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -21,9 +21,9 @@ from llmfoundry.models import MPTForCausalLM from llmfoundry.models.model_registry import COMPOSER_MODEL_REGISTRY -from llmfoundry.utils.builders import (build_eval_loaders, - build_icl_data_and_gauntlet, - build_logger, build_tokenizer) +from llmfoundry.utils.builders import (add_metrics_to_eval_loaders, + build_evaluators, build_logger, + build_tokenizer) from llmfoundry.utils.config_utils import pop_config, process_init_device @@ -125,9 +125,15 @@ def evaluate_model( tokenizer_kwargs = tokenizer_cfg.get('kwargs', {}) tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) - evaluators, logger_keys, eval_gauntlet_callback = build_icl_data_and_gauntlet( - icl_tasks, eval_gauntlet_config, tokenizer, device_eval_batch_size, - max_seq_len, icl_subset_num_batches) + evaluators, logger_keys, eval_gauntlet_callback = build_evaluators( + eval_loader_config, + icl_tasks, + eval_gauntlet_config, + tokenizer=tokenizer, + device_eval_batch_size=device_eval_batch_size, + icl_seq_len=max_seq_len, + icl_subset_num_batches=icl_subset_num_batches, + ) callbacks = [] if eval_gauntlet_callback is not None: @@ -150,14 +156,9 @@ def evaluate_model( composer_model = load_model(model_cfg.model, tokenizer, fsdp_config, num_retries) + # Now add the eval metrics if eval_loader_config is not None: - evaluators.extend( - build_eval_loaders( - eval_loader_config, - composer_model, - tokenizer, - device_eval_batch_size, - )) + evaluators = add_metrics_to_eval_loaders(evaluators, composer_model) if eval_gauntlet_df is None and eval_gauntlet_callback is not None: eval_gauntlet_df = pd.DataFrame( diff --git a/scripts/train/train.py b/scripts/train/train.py index 34ebbf9219..c665d85d62 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -25,11 +25,11 @@ from llmfoundry import (COMPOSER_MODEL_REGISTRY, ComposerHFCausalLM, MPTForCausalLM) from llmfoundry.data.dataloader import build_dataloader -from llmfoundry.utils.builders import (build_algorithm, build_callback, - build_eval_loaders, - build_icl_data_and_gauntlet, - build_logger, build_optimizer, - build_scheduler, build_tokenizer) +from llmfoundry.utils.builders import (add_metrics_to_eval_loaders, + build_algorithm, build_callback, + build_evaluators, build_logger, + build_optimizer, build_scheduler, + build_tokenizer) from llmfoundry.utils.config_utils import (log_config, pop_config, process_init_device, update_batch_size_info) @@ -524,6 +524,22 @@ def main(cfg: DictConfig) -> 
Trainer: if mosaicml_logger is not None: mosaicml_logger.log_metrics({'data_validated': time.time()}) + ## Evaluation + print('Building eval loader...') + eval_icl_seq_len: int = icl_seq_len if icl_seq_len else max_seq_len + evaluators, _, eval_gauntlet_callback = build_evaluators( + eval_loader_config, + icl_tasks_config, + eval_gauntlet_config, + tokenizer=tokenizer, + device_eval_batch_size=device_eval_batch_size, + icl_seq_len=eval_icl_seq_len, + icl_subset_num_batches=icl_subset_num_batches, + ) + + if eval_gauntlet_callback is not None: + callbacks.append(eval_gauntlet_callback) + # Build Model print('Initializing model...') with init_context: @@ -548,27 +564,9 @@ def main(cfg: DictConfig) -> Trainer: optimizer_name: str = optimizer_config.pop('name') optimizer = build_optimizer(model, optimizer_name, optimizer_config) - ## Evaluation - print('Building eval loader...') - evaluators = [] + # Now add the eval metrics if eval_loader_config is not None: - evaluators = build_eval_loaders( - eval_loader_config, - model, - tokenizer, - device_eval_batch_size, - ) - - eval_gauntlet_callback = None - if icl_tasks_config is not None: - icl_evaluators, _, eval_gauntlet_callback = build_icl_data_and_gauntlet( - icl_tasks_config, eval_gauntlet_config, tokenizer, - device_eval_batch_size, icl_seq_len if icl_seq_len else max_seq_len, - icl_subset_num_batches) - evaluators.extend(icl_evaluators) - - if eval_gauntlet_callback is not None: - callbacks.append(eval_gauntlet_callback) + evaluators = add_metrics_to_eval_loaders(evaluators, model) # Build the Trainer print('Building trainer...') From 549abb62703b03058711af4757db6f05bea676f9 Mon Sep 17 00:00:00 2001 From: Anna Pfohl Date: Thu, 30 Nov 2023 00:50:31 +0000 Subject: [PATCH 10/11] NOT --- llmfoundry/utils/builders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 82c7420721..fe49def9de 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -109,7 +109,7 @@ def add_metrics_to_eval_loaders( evaluators: List[Evaluator], model: Any, ) -> List[Evaluator]: - if model.train_metrics is not None: + if model.train_metrics is None: raise ValueError( 'Eval loader requires metrics, either through the models defaults and/or train_metrics' ) From 084d8b64c5522a2b68d95d9ad110713c63c72834 Mon Sep 17 00:00:00 2001 From: Anna Pfohl Date: Thu, 30 Nov 2023 18:28:30 +0000 Subject: [PATCH 11/11] metrics instead of models, add unit tests --- llmfoundry/utils/builders.py | 12 ++-- scripts/eval/eval.py | 3 +- scripts/train/train.py | 3 +- tests/test_builders.py | 118 ++++++++++++++++++++++++++++++++++- 4 files changed, 125 insertions(+), 11 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index fe49def9de..a672fbee55 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -28,6 +28,7 @@ from omegaconf import DictConfig, ListConfig from omegaconf import OmegaConf as om from torch.optim.optimizer import Optimizer +from torchmetrics import Metric from transformers import AutoTokenizer, PreTrainedTokenizerBase from llmfoundry.callbacks import (EvalGauntlet, FDiffMetrics, GlobalLRScaling, @@ -107,18 +108,13 @@ def build_eval_loaders( def add_metrics_to_eval_loaders( evaluators: List[Evaluator], - model: Any, + metrics: Dict[str, Metric], ) -> List[Evaluator]: - if model.train_metrics is None: - raise ValueError( - 'Eval loader requires metrics, either through the models defaults and/or train_metrics' - ) - - 
eval_metric_names = list(model.train_metrics.keys()) + metric_names = list(metrics.keys()) eval_loaders, other_evaluators = [], [] for evaluator in evaluators: if evaluator.metric_names == []: - evaluator.metric_names = eval_metric_names + evaluator.metric_names = metric_names eval_loaders.append(evaluator) else: other_evaluators.append(evaluator) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index b59de6ac5e..369a894720 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -158,7 +158,8 @@ def evaluate_model( # Now add the eval metrics if eval_loader_config is not None: - evaluators = add_metrics_to_eval_loaders(evaluators, composer_model) + train_metrics = composer_model.get_metrics(is_train=True) + evaluators = add_metrics_to_eval_loaders(evaluators, train_metrics) if eval_gauntlet_df is None and eval_gauntlet_callback is not None: eval_gauntlet_df = pd.DataFrame( diff --git a/scripts/train/train.py b/scripts/train/train.py index c665d85d62..809f2fb09c 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -566,7 +566,8 @@ def main(cfg: DictConfig) -> Trainer: # Now add the eval metrics if eval_loader_config is not None: - evaluators = add_metrics_to_eval_loaders(evaluators, model) + train_metrics = model.get_metrics(is_train=True) + evaluators = add_metrics_to_eval_loaders(evaluators, train_metrics) # Build the Trainer print('Building trainer...') diff --git a/tests/test_builders.py b/tests/test_builders.py index 7ac179720e..5c38ed8602 100644 --- a/tests/test_builders.py +++ b/tests/test_builders.py @@ -5,17 +5,22 @@ import unittest.mock as mock from copy import deepcopy from typing import Any, Dict, Union +from unittest.mock import MagicMock import pytest import torch import torch.nn as nn from composer.callbacks import Generate +from composer.core import Evaluator +from omegaconf import DictConfig, ListConfig from omegaconf import OmegaConf as om from transformers import PreTrainedTokenizerBase from llmfoundry.callbacks import HuggingFaceCheckpointer from llmfoundry.tokenizers.tiktoken import TiktokenTokenizerWrapper -from llmfoundry.utils.builders import (build_callback, build_optimizer, +from llmfoundry.utils.builders import (add_metrics_to_eval_loaders, + build_callback, build_eval_loaders, + build_evaluators, build_optimizer, build_tokenizer) @@ -195,3 +200,114 @@ def test_build_optimizer(name: str, optimizer_config: Dict[str, Any], for n, p in model.named_parameters(): if re.search(param_str_match, n): assert id(p) in param_ids + + +def test_build_evaluators_empty(): + evaluators, logger_keys, eval_gauntlet_callback = build_evaluators( + None, + None, + None, + tokenizer=None, # type: ignore + device_eval_batch_size=1, + icl_seq_len=2, + icl_subset_num_batches=3) + assert evaluators == [] + assert logger_keys == [] + assert eval_gauntlet_callback is None + + +def test_build_eval_loaders(monkeypatch: pytest.MonkeyPatch): + tokenizer = TiktokenTokenizerWrapper(model_name='gpt-4') + + eval_loader_cfg = DictConfig({ + 'name': 'text', + 'dataset': { + # mocked, not needed + }, + 'drop_last': False, + 'num_workers': 8, + }) + monkeypatch.setattr('llmfoundry.data.text_data.StreamingTextDataset', + lambda *args, **kwargs: MagicMock()) + eval_loaders = build_eval_loaders(eval_loader_cfg, tokenizer, 2) + + assert len(eval_loaders) == 1 + + assert eval_loaders[0].label == 'eval' + assert eval_loaders[0].dataloader is not None + assert eval_loaders[0].metric_names == [] + + multi_eval_loader_cfg = ListConfig([ + { + 'name': 'text', + 'label': 'test1', + 
'dataset': { + # mocked, not needed + }, + 'drop_last': False, + 'num_workers': 8, + }, + { + 'name': 'text', + 'label': 'test2', + 'dataset': { + # mocked, not needed + }, + 'drop_last': False, + 'num_workers': 8, + } + ]) + monkeypatch.setattr('llmfoundry.data.text_data.StreamingTextDataset', + lambda *args, **kwargs: MagicMock()) + eval_loaders2 = build_eval_loaders(multi_eval_loader_cfg, tokenizer, 2) + + assert len(eval_loaders2) == 2 + + assert eval_loaders2[0].label == 'eval/test1' + assert eval_loaders2[0].dataloader is not None + assert eval_loaders2[0].metric_names == [] + + assert eval_loaders2[1].label == 'eval/test2' + assert eval_loaders2[1].dataloader is not None + assert eval_loaders2[1].metric_names == [] + + +def test_add_metrics_to_eval_loaders(): + evaluators = [ + Evaluator( + label='first', + metric_names=['a', 'b'], + dataloader=None, # type: ignore + device_eval_microbatch_size=1, + ), + Evaluator( + label='second', + metric_names=[], + dataloader=None, # type: ignore + device_eval_microbatch_size=1, + ), + Evaluator( + label='third', + metric_names=['c'], + dataloader=None, # type: ignore + device_eval_microbatch_size=1, + ) + ] + + new_evaluators = add_metrics_to_eval_loaders( + evaluators, + { + 'new1': 'foo', + 'new2': 'bar' + }, # type: ignore + ) + assert len(new_evaluators) == 3 + + assert new_evaluators[0].label == 'second' + assert new_evaluators[0].metric_names == ['new1', 'new2'] + + assert new_evaluators[1].label == 'first' + assert new_evaluators[1].metric_names == ['a', 'b'] + + assert new_evaluators[2].label == 'third' + assert new_evaluators[2].metric_names == ['c']
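
Taken together, these patches let an eval or train config supply either a single `eval_loader` block or a list of labeled blocks, built first with empty metric names and filled in once the model exists. The snippet below is a minimal sketch of that builder flow, not part of the patch series: the dataset paths are placeholders, `composer_model` is assumed to be an already-constructed ComposerModel, and the dataset fields mirror the `text` dataloader configs used elsewhere in the repo, so they are illustrative rather than a working data path.

    from omegaconf import OmegaConf as om

    from llmfoundry.utils.builders import (add_metrics_to_eval_loaders,
                                           build_eval_loaders, build_tokenizer)

    # Two labeled eval loaders -> Evaluators labeled 'eval/c4' and 'eval/arxiv'.
    # A single (non-list) block would instead yield one Evaluator labeled 'eval'.
    eval_loader_config = om.create([
        {
            'name': 'text',
            'label': 'c4',
            'dataset': {
                'local': '/path/to/my-copy-c4',  # placeholder path
                'split': 'val',
                'max_seq_len': 2048,
                'shuffle': False,
            },
            'drop_last': False,
            'num_workers': 8,
        },
        {
            'name': 'text',
            'label': 'arxiv',
            'dataset': {
                'local': '/path/to/my-copy-arxiv',  # placeholder path
                'split': 'train',
                'max_seq_len': 2048,
                'shuffle': False,
            },
            'drop_last': False,
            'num_workers': 8,
        },
    ])

    tokenizer = build_tokenizer('EleutherAI/gpt-neox-20b', {})

    # Evaluators come back with metric_names=[] so the eval data can fail fast...
    evaluators = build_eval_loaders(eval_loader_config, tokenizer,
                                    device_eval_batch_size=4)

    # ...and the model's train metrics are attached after the model is built,
    # with the plain eval loaders ordered ahead of any ICL evaluators.
    evaluators = add_metrics_to_eval_loaders(
        evaluators, composer_model.get_metrics(is_train=True))

In `scripts/eval/eval.py` and `scripts/train/train.py` this same sequence runs inside `build_evaluators` plus a later `add_metrics_to_eval_loaders` call, so user configs only need the `eval_loader` block itself.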