From aa433577c290406b3ff204dc806ec8054edf64aa Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sat, 28 Oct 2023 14:56:50 -0400 Subject: [PATCH 1/3] cleanup verbosity a bit --- src/axolotl/train.py | 11 +++++++++-- src/axolotl/utils/distributed.py | 11 +++++++++++ src/axolotl/utils/trainer.py | 24 +++++++++++++++--------- 3 files changed, 35 insertions(+), 11 deletions(-) diff --git a/src/axolotl/train.py b/src/axolotl/train.py index 468d25e14..e5cfdfb42 100644 --- a/src/axolotl/train.py +++ b/src/axolotl/train.py @@ -17,6 +17,7 @@ from axolotl.common.cli import TrainerCliArgs from axolotl.logging_config import configure_logging from axolotl.utils.dict import DictDefault +from axolotl.utils.distributed import zero_only from axolotl.utils.models import load_model, load_tokenizer from axolotl.utils.trainer import setup_trainer @@ -43,7 +44,10 @@ def train( *, cfg: DictDefault, cli_args: TrainerCliArgs, dataset_meta: TrainDatasetMeta ): # load the tokenizer first - LOG.info(f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}") + with zero_only(): + LOG.debug( + f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}" + ) tokenizer = load_tokenizer(cfg) train_dataset = dataset_meta.train_dataset @@ -51,7 +55,10 @@ def train( total_num_steps = dataset_meta.total_num_steps # Load the model and tokenizer - LOG.info("loading model and (optionally) peft_config...") + msg = "loading model" + if cfg.adapter: + msg += " and peft_config..." + LOG.debug(msg) model, peft_config = load_model(cfg, tokenizer, inference=cli_args.inference) safe_serialization = cfg.save_safetensors is True diff --git a/src/axolotl/utils/distributed.py b/src/axolotl/utils/distributed.py index 9a1c689fb..313dd24e8 100644 --- a/src/axolotl/utils/distributed.py +++ b/src/axolotl/utils/distributed.py @@ -50,6 +50,17 @@ def get_world_size(): return int(os.getenv("WORLD_SIZE", "1")) +@contextmanager +def zero_only(): + """ + Context manager that only runs the enclosed block on the main rank. 
+ """ + if is_main_process(): + yield + else: + yield None + + @contextmanager def zero_first(is_main): """ diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index d04390293..9a6afcd17 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -21,6 +21,7 @@ is_main_process, reduce_and_broadcast, zero_first, + zero_only, ) LOG = logging.getLogger("axolotl") @@ -153,14 +154,14 @@ def calculate_total_num_steps(cfg, train_dataset, tokenizer): # we have to drop anything longer then sequence len otherwise # flash attention with position ids fails if not cfg.total_num_tokens: - LOG.info("calculating total_num_tokens") total_num_tokens = np.sum( train_dataset.data.column("input_ids") .to_pandas() .apply(lambda x: len(x)) # pylint: disable=unnecessary-lambda .values ) - LOG.info(f"total_num_tokens: {total_num_tokens}") + with zero_only(): + LOG.debug(f"total_num_tokens: {total_num_tokens}") cfg.total_num_tokens = total_num_tokens if not cfg.total_supervised_tokens: @@ -170,7 +171,8 @@ def calculate_total_num_steps(cfg, train_dataset, tokenizer): .apply(lambda x: np.sum(np.array(x) != -100)) .sum() ) - LOG.info(f"`total_supervised_tokens: {total_supervised_tokens}`") + with zero_only(): + LOG.debug(f"`total_supervised_tokens: {total_supervised_tokens}`") cfg.total_supervised_tokens = total_supervised_tokens if cfg.sample_packing_eff_est: @@ -189,9 +191,10 @@ def calculate_total_num_steps(cfg, train_dataset, tokenizer): ) * cfg.num_epochs ) - LOG.info( - f"total_num_tokens: {cfg.total_num_tokens}, total_num_steps: {total_num_steps}" - ) + with zero_only(): + LOG.debug( + f"total_num_tokens: {cfg.total_num_tokens}, total_num_steps: {total_num_steps}" + ) else: if cfg.world_size > 1 and is_distributed(): sampler = DistributedSampler( @@ -220,7 +223,8 @@ def calculate_total_num_steps(cfg, train_dataset, tokenizer): ) data_loader_len = data_loader.len_w_stats() actual_eff = data_loader.efficiency() - LOG.info(f"data_loader_len: {data_loader_len}") + with zero_only(): + LOG.debug(f"data_loader_len: {data_loader_len}") # FIXME: is there a bug here somewhere? 
the total num steps depends # on the agreed on value for sample_packing_eff_est total_num_steps = int(math.floor(data_loader_len * cfg.num_epochs)) @@ -237,12 +241,14 @@ def calc_sample_packing_eff_est(estimates: List[float]): math.ceil(sample_packing_actual_eff_all * 100.0) / 100.0 ) cfg.sample_packing_eff_est = sample_packing_eff_est - LOG.info(f"sample_packing_eff_est: {cfg.sample_packing_eff_est}") + with zero_only(): + LOG.debug(f"sample_packing_eff_est: {cfg.sample_packing_eff_est}") else: total_num_steps = int( math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size) ) - LOG.info(f"total_num_steps: {total_num_steps}") + with zero_only(): + LOG.debug(f"total_num_steps: {total_num_steps}") return total_num_steps From 804abde8129b9e48f10514433952eeb9f7a147f3 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sat, 28 Oct 2023 14:59:36 -0400 Subject: [PATCH 2/3] chore: lint --- gitbook/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/gitbook/README.md b/gitbook/README.md index 5c4b4d58a..642bde22a 100644 --- a/gitbook/README.md +++ b/gitbook/README.md @@ -1,2 +1 @@ # Page - From 4cbd2459066c61fd1cef92ff462c0d024967d882 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Mon, 6 Nov 2023 07:27:42 -0500 Subject: [PATCH 3/3] use accelerate logging for zero/main loggin only --- src/axolotl/train.py | 13 ++++++------- src/axolotl/utils/trainer.py | 34 +++++++++++++++++----------------- 2 files changed, 23 insertions(+), 24 deletions(-) diff --git a/src/axolotl/train.py b/src/axolotl/train.py index e5cfdfb42..a87746071 100644 --- a/src/axolotl/train.py +++ b/src/axolotl/train.py @@ -1,6 +1,5 @@ """Prepare and train a model on a dataset. Can also infer from a model or merge lora""" -import logging import os import signal import sys @@ -10,6 +9,7 @@ import torch import transformers.modelcard +from accelerate.logging import get_logger from datasets import Dataset from optimum.bettertransformer import BetterTransformer from transformers.deepspeed import is_deepspeed_zero3_enabled @@ -17,7 +17,6 @@ from axolotl.common.cli import TrainerCliArgs from axolotl.logging_config import configure_logging from axolotl.utils.dict import DictDefault -from axolotl.utils.distributed import zero_only from axolotl.utils.models import load_model, load_tokenizer from axolotl.utils.trainer import setup_trainer @@ -26,7 +25,7 @@ sys.path.insert(0, src_dir) configure_logging() -LOG = logging.getLogger("axolotl.train") +LOG = get_logger("axolotl.train") @dataclass @@ -44,10 +43,10 @@ def train( *, cfg: DictDefault, cli_args: TrainerCliArgs, dataset_meta: TrainDatasetMeta ): # load the tokenizer first - with zero_only(): - LOG.debug( - f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}" - ) + LOG.debug( + f"loading tokenizer... 
{cfg.tokenizer_config or cfg.base_model_config}", + main_process_only=True, + ) tokenizer = load_tokenizer(cfg) train_dataset = dataset_meta.train_dataset diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index 9a6afcd17..6bcf0dac1 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -1,5 +1,4 @@ """Module containing the Trainer class and related functions""" -import logging import math import os from contextlib import contextmanager @@ -10,6 +9,7 @@ import torch import torch.cuda import torch.distributed as dist +from accelerate.logging import get_logger from datasets import set_caching_enabled from torch.utils.data import DistributedSampler, RandomSampler @@ -21,10 +21,9 @@ is_main_process, reduce_and_broadcast, zero_first, - zero_only, ) -LOG = logging.getLogger("axolotl") +LOG = get_logger("axolotl") @torch.jit.script @@ -160,8 +159,7 @@ def calculate_total_num_steps(cfg, train_dataset, tokenizer): .apply(lambda x: len(x)) # pylint: disable=unnecessary-lambda .values ) - with zero_only(): - LOG.debug(f"total_num_tokens: {total_num_tokens}") + LOG.debug(f"total_num_tokens: {total_num_tokens}", main_process_only=True) cfg.total_num_tokens = total_num_tokens if not cfg.total_supervised_tokens: @@ -171,8 +169,10 @@ def calculate_total_num_steps(cfg, train_dataset, tokenizer): .apply(lambda x: np.sum(np.array(x) != -100)) .sum() ) - with zero_only(): - LOG.debug(f"`total_supervised_tokens: {total_supervised_tokens}`") + LOG.debug( + f"`total_supervised_tokens: {total_supervised_tokens}`", + main_process_only=True, + ) cfg.total_supervised_tokens = total_supervised_tokens if cfg.sample_packing_eff_est: @@ -191,10 +191,10 @@ def calculate_total_num_steps(cfg, train_dataset, tokenizer): ) * cfg.num_epochs ) - with zero_only(): - LOG.debug( - f"total_num_tokens: {cfg.total_num_tokens}, total_num_steps: {total_num_steps}" - ) + LOG.debug( + f"total_num_tokens: {cfg.total_num_tokens}, total_num_steps: {total_num_steps}", + main_process_only=True, + ) else: if cfg.world_size > 1 and is_distributed(): sampler = DistributedSampler( @@ -223,8 +223,7 @@ def calculate_total_num_steps(cfg, train_dataset, tokenizer): ) data_loader_len = data_loader.len_w_stats() actual_eff = data_loader.efficiency() - with zero_only(): - LOG.debug(f"data_loader_len: {data_loader_len}") + LOG.debug(f"data_loader_len: {data_loader_len}", main_process_only=True) # FIXME: is there a bug here somewhere? the total num steps depends # on the agreed on value for sample_packing_eff_est total_num_steps = int(math.floor(data_loader_len * cfg.num_epochs)) @@ -241,14 +240,15 @@ def calc_sample_packing_eff_est(estimates: List[float]): math.ceil(sample_packing_actual_eff_all * 100.0) / 100.0 ) cfg.sample_packing_eff_est = sample_packing_eff_est - with zero_only(): - LOG.debug(f"sample_packing_eff_est: {cfg.sample_packing_eff_est}") + LOG.debug( + f"sample_packing_eff_est: {cfg.sample_packing_eff_est}", + main_process_only=True, + ) else: total_num_steps = int( math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size) ) - with zero_only(): - LOG.debug(f"total_num_steps: {total_num_steps}") + LOG.debug(f"total_num_steps: {total_num_steps}", main_process_only=True) return total_num_steps