From 4be387d23b61ff1a6df2367aad3510f525535560 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Mon, 30 Jan 2023 18:39:23 -0500 Subject: [PATCH] Profiling and misc (#10) --- Dockerfile | 4 +- Makefile | 52 +++------ requirements.txt | 2 +- src/__init__.py | 0 src/main.py | 18 ++-- src/pipelines/__init__.py | 14 ++- src/pipelines/ds.py | 25 +++++ src/pipelines/ds_inference.py | 39 ------- src/pipelines/hf.py | 32 ------ src/pipelines/pipeline.py | 175 ++++++++++++++++++++++--------- src/pipelines/transformers.py | 10 ++ src/utils/__init__.py | 3 - src/utils/arguments.py | 68 ++++++++++-- src/utils/benchmark.py | 156 ++++++++++++++++----------- src/utils/fast_init.py | 52 +++++++++ src/utils/{dummy.py => input.py} | 0 src/utils/logging.py | 45 ++++++++ src/utils/utils.py | 31 +----- transformers | 2 +- 19 files changed, 454 insertions(+), 274 deletions(-) create mode 100644 src/__init__.py create mode 100644 src/pipelines/ds.py delete mode 100644 src/pipelines/ds_inference.py delete mode 100644 src/pipelines/hf.py create mode 100644 src/pipelines/transformers.py create mode 100644 src/utils/fast_init.py rename src/utils/{dummy.py => input.py} (100%) create mode 100644 src/utils/logging.py diff --git a/Dockerfile b/Dockerfile index 2b24dc242a74b3..c8a786fa239ee1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,6 +11,8 @@ RUN useradd -m -u $USER -s /bin/bash $USERNAME \ COPY --chown=$USERNAME ./requirements.txt ./ COPY --chown=$USERNAME transformers/ ./transformers -RUN pip install -r requirements.txt + +# Stock version of pip doesn't work with editable transformers. +RUN pip install --upgrade pip --no-cache-dir && pip install -r requirements.txt --no-cache-dir COPY --chown=$USERNAME src/ ./src diff --git a/Makefile b/Makefile index 3d73df02bd3f19..c147da5bb560d3 100644 --- a/Makefile +++ b/Makefile @@ -6,69 +6,49 @@ style: batch_size := 1 -install-mqa-transformers: - git clone https://github.com/bigcode-project/transformers.git; \ - cd transformers; \ - git checkout mayank/multi_query; \ - pip install .; \ - cd ..; \ - rm -rf transformers; +install: + git submodule update --init + pip install -r requirements.txt # BLOOM AliBi hf-1b-bloom-fp32: - python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class BLOOM --dtype float32 --batch_size ${batch_size} + python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class BLOOM --dtype float32 --batch_size ${batch_size} hf-1b-bloom-bf16: - python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class BLOOM --dtype bfloat16 --batch_size ${batch_size} + python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class BLOOM --dtype bfloat16 --batch_size ${batch_size} hf-1b-bloom-int8: - python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class BLOOM --dtype int8 --batch_size ${batch_size} + python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class BLOOM --dtype int8 --batch_size ${batch_size} ds-inference-1b-bloom-fp16: - deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class BLOOM --batch_size ${batch_size} + deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Pipeline --model_class BLOOM --batch_size ${batch_size} # GPT2 MHA hf-1b-GPT2-mha-fp32: - python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype float32 --batch_size ${batch_size} + python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype float32 --batch_size ${batch_size} hf-1b-GPT2-mha-bf16: - python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype bfloat16 --batch_size ${batch_size} + python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype bfloat16 --batch_size ${batch_size} hf-1b-GPT2-mha-int8: - python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype int8 --batch_size ${batch_size} + python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype int8 --batch_size ${batch_size} ds-inference-1b-GPT2-mha-fp16: - deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --batch_size ${batch_size} - -# GPT2 MQA -hf-1b-GPT2-mqa-fp32: - python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype float32 --batch_size ${batch_size} - -hf-1b-GPT2-mqa-bf16: - python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype bfloat16 --batch_size ${batch_size} - -hf-1b-GPT2-mqa-int8: - python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype int8 --batch_size ${batch_size} - -ds-inference-1b-GPT2-mqa-fp16: - deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --batch_size ${batch_size} + deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --batch_size ${batch_size} # GPT2 MQA1 hf-1b-GPT2-mqa1-fp32: - python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --dtype float32 --batch_size ${batch_size} + python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype float32 --batch_size ${batch_size} hf-1b-GPT2-mqa1-bf16: - python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --dtype bfloat16 --batch_size ${batch_size} + python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype bfloat16 --batch_size ${batch_size} hf-1b-GPT2-mqa1-int8: - python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --dtype int8 --batch_size ${batch_size} - -ds-inference-1b-GPT2-mqa1-fp16: - deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --batch_size ${batch_size} + python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype int8 --batch_size ${batch_size} # Input length experiments hf-1b-GPT2-mqa1-int8-input-length: - python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --dtype int8 --batch_size ${batch_size} --max_input_length ${max_input_length} + python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype int8 --batch_size ${batch_size} --max_input_length ${max_input_length} hf-1b-GPT2-mha-int8-input-length: - python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype int8 --batch_size ${batch_size} --max_input_length ${max_input_length} + python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype int8 --batch_size ${batch_size} --max_input_length ${max_input_length} diff --git a/requirements.txt b/requirements.txt index f6512732c78380..2b8ca551953a60 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ accelerate==0.15.0 bitsandbytes deepspeed==0.7.7 -./transformers +-e ./transformers # TODO: Dev only isort>=5.5.4 diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/src/main.py b/src/main.py index 30ec6a1ecdcca9..ec9ffc15b33dd1 100644 --- a/src/main.py +++ b/src/main.py @@ -1,19 +1,23 @@ -import pipelines -from utils import benchmark_end_to_end, get_arg_parser, get_args, get_dummy_batch +from typing import List, Optional +from src.pipelines import get_pipeline_class +from src.utils.arguments import parse_args +from src.utils.benchmark import benchmark_end_to_end +from src.utils.input import get_dummy_batch +from src.utils.logging import configure_logging -def main() -> None: - # deepspeed.init_distributed("nccl") - args = get_args(get_arg_parser()) +def main(argv: Optional[List[str]] = None) -> None: + args = parse_args(argv=argv) inputs = get_dummy_batch(args.batch_size, args.max_input_length) - generate_kwargs = dict(max_new_tokens=args.max_new_tokens, do_sample=False) + generate_kwargs = {"max_new_tokens": args.max_new_tokens, "do_sample": False} - pipeline_class = getattr(pipelines, args.pipeline_class) + pipeline_class = get_pipeline_class(args.pipeline_class) benchmark_end_to_end(args, pipeline_class, inputs, generate_kwargs) if __name__ == "__main__": + configure_logging() main() diff --git a/src/pipelines/__init__.py b/src/pipelines/__init__.py index 2581940f705621..52027b5c0853ef 100644 --- a/src/pipelines/__init__.py +++ b/src/pipelines/__init__.py @@ -1,3 +1,11 @@ -from .ds_inference import DS_Inference_Pipeline -from .hf import HF_CPU_Pipeline, HF_GPU_Pipeline -from .pipeline import Pipeline +def get_pipeline_class(name): + if name == "HF_Pipeline": + from src.pipelines.transformers import HF_Pipeline + + return HF_Pipeline + elif name == "DS_Pipeline": + from src.pipelines.ds import DS_Pipeline + + return DS_Pipeline + else: + raise NotImplementedError(f"Unsupported pipeline class: {name}") diff --git a/src/pipelines/ds.py b/src/pipelines/ds.py new file mode 100644 index 00000000000000..047cfd676d4a25 --- /dev/null +++ b/src/pipelines/ds.py @@ -0,0 +1,25 @@ +import os +from argparse import Namespace + +import deepspeed +import torch + +from src.pipelines.pipeline import Pipeline +from src.utils.arguments import check_unused + + +class DS_Pipeline(Pipeline): + def __init__(self, args: Namespace) -> None: + check_unused(args, {"device": torch.device("cuda")}, enforce=True) + # TODO: Works with other dtypes? + check_unused(args, {"dtype": torch.float16}) + super().__init__(args) + + self.model = deepspeed.init_inference( + self.model, + mp_size=int(os.getenv("WORLD_SIZE", "1")), + # base_dir="./", + dtype=args.dtype, + replace_with_kernel_inject=args.inject_kernel, + enable_cuda_graph=args.cuda_graph, + ) diff --git a/src/pipelines/ds_inference.py b/src/pipelines/ds_inference.py deleted file mode 100644 index 96a27d5e46d476..00000000000000 --- a/src/pipelines/ds_inference.py +++ /dev/null @@ -1,39 +0,0 @@ -import os -from argparse import Namespace - -import deepspeed -import torch - -from .pipeline import Pipeline - - -class DS_Inference_Pipeline(Pipeline): - def __init__(self, args: Namespace) -> None: - super().__init__(args) - - world_size = int(os.getenv("WORLD_SIZE", "1")) - - # with deepspeed.OnDevice(dtype=torch.bfloat16, device="meta"): - # model = BloomForCausalLM._from_config(config, torch_dtype=torch.bfloat16) - self.model = self.model_class.from_pretrained("tmp", torch_dtype=torch.bfloat16) - self.model.eval() - - # checkpoints_json = os.path.join(args.model_name, "checkpoints.json") - - # if dist.get_rank() == 0: - # with io.open(checkpoints_json, "w", encoding="utf-8") as f: - # checkpoint_files = [str(entry) for entry in Path(args.model_name).rglob("*.[bp][it][n]") if entry.is_file()] - # data = {"type": "BLOOM", "checkpoints": checkpoint_files, "version": 1.0} - # json.dump(data, f) - # dist.barrier() - - self.model = deepspeed.init_inference( - self.model, - mp_size=world_size, - # base_dir="./", - dtype=torch.float16, - replace_with_kernel_inject=True - # checkpoint=checkpoints_json, - ) - - self.input_device = torch.cuda.current_device() diff --git a/src/pipelines/hf.py b/src/pipelines/hf.py deleted file mode 100644 index 105007449ddf46..00000000000000 --- a/src/pipelines/hf.py +++ /dev/null @@ -1,32 +0,0 @@ -from argparse import Namespace - -import torch - -from .pipeline import Pipeline - - -class HF_Pipeline(Pipeline): - def __init__(self, args: Namespace, device: str = "cpu") -> None: - super().__init__(args) - - model_kwargs = {} - - if args.dtype == torch.int8: - model_kwargs["load_in_8bit"] = True - model_kwargs["device_map"] = "auto" - else: - model_kwargs["torch_dtype"] = args.dtype - - self.input_device = device - self.model = self.model_class.from_pretrained("tmp", **model_kwargs).to(self.input_device) - self.model.eval() - - -class HF_CPU_Pipeline(HF_Pipeline): - def __init__(self, args: Namespace) -> None: - super().__init__(args, "cpu") - - -class HF_GPU_Pipeline(HF_Pipeline): - def __init__(self, args: Namespace) -> None: - super().__init__(args, "cuda:0") diff --git a/src/pipelines/pipeline.py b/src/pipelines/pipeline.py index fdbf40c8df539d..939992175cfaac 100644 --- a/src/pipelines/pipeline.py +++ b/src/pipelines/pipeline.py @@ -1,27 +1,112 @@ -import os +import gc +import logging +import time from argparse import Namespace -from typing import List, Tuple, Union +from typing import Any, Dict, List, Tuple, Type +import numpy as np import torch -from transformers import AutoTokenizer, BloomConfig, BloomForCausalLM, GPT2Config, GPT2LMHeadModel +from src.utils.arguments import check_unused +from src.utils.fast_init import fast_init +from src.utils.logging import format_ms, log_rank_n +from transformers import AutoTokenizer, BloomForCausalLM, GPT2LMHeadModel, PretrainedConfig, PreTrainedModel + + +logger = logging.getLogger(__name__) + +NUM_GENERATED_TOKENS = "num_generated_tokens" +TOKENIZE_TIME = "tokenize_time" +MODEL_TIME = "model_time" +DECODE_TIME = "decode_time" +END_TO_END_TIME = "end_to_end_time" + +METRIC_KEYS = ( + NUM_GENERATED_TOKENS, + TOKENIZE_TIME, + MODEL_TIME, + DECODE_TIME, + END_TO_END_TIME, +) class Pipeline: def __init__(self, args: Namespace) -> None: - self.config, self.tokenizer, self.model_class = get_config_tokenizer_model_class(args) - self.model = None - self.input_device = None - - def __call__(self, text: List[str], **generate_kwargs) -> Tuple[List[str], List[int]]: + log_rank_n("*** Setting up tokenizer", logger.info) + self.tokenizer = AutoTokenizer.from_pretrained("gpt2") + self.tokenizer.add_special_tokens({"pad_token": "[PAD]"}) + + self.device = args.device + + model_class, config = self.get_config(args) + is_int8 = args.dtype == torch.int8 + if is_int8: + check_unused(args, {"device": torch.device("cuda")}, enforce=True) + torch_dtype = torch.float16 if is_int8 else args.dtype + + log_rank_n("*** Creating model", logger.info) + with fast_init(self.device): + self.model = model_class._from_config(config=config, torch_dtype=torch_dtype) + log_rank_n("*** Moving to device", logger.info) + self.model.to(self.device) + log_rank_n("*** Initializing weights", logger.info) + # Initialization is ~1000x faster on GPU. + self.model.init_weights() + + # Int8 can only be obtained by reloading a pretrained model + if is_int8: + log_rank_n("*** Saving model", logger.info) + self.model.save_pretrained("tmp") + self.model = None + gc.collect() + torch.cuda.empty_cache() + log_rank_n("*** Reloading model in int8", logger.info) + with fast_init(self.device): + self.model = model_class.from_pretrained( + "tmp", + load_in_8bit=True, + device_map="auto", + ) + + self.model.eval() + + def get_config(self, args) -> Tuple[Type[PreTrainedModel], PretrainedConfig]: + config_args = { + "activation_function": args.activation_function, + "n_head": args.n_head, + "n_layer": args.n_layer, + "bos_token_id": self.tokenizer.bos_token_id, + "eos_token_id": self.tokenizer.eos_token_id, + "vocab_size": len(self.tokenizer), + "use_cache": True, + } + if args.model_class.lower() == "bloom": + check_unused(args, {"attention_type": 1, "n_positions": None}) + config_args["attention_softmax_in_fp32"] = True + config_args["hidden_size"] = args.hidden_size + model_class = BloomForCausalLM + elif args.model_class.lower() == "gpt2": + config_args["attention_type"] = args.attention_type + config_args["n_embd"] = args.hidden_size + config_args["n_positions"] = args.n_positions + model_class = GPT2LMHeadModel + else: + raise NotImplementedError() + + return model_class, model_class.config_class(**config_args) + + def __call__(self, text: List[str], **generate_kwargs) -> Tuple[List[str], Dict[str, Any]]: + t0 = time.perf_counter() input_tokens = self.tokenizer(text, return_tensors="pt", padding=True) for t in input_tokens: if torch.is_tensor(input_tokens[t]): - input_tokens[t] = input_tokens[t].to(self.input_device) + input_tokens[t] = input_tokens[t].to(self.device) + t1 = time.perf_counter() with torch.no_grad(): output = self.model.generate(**input_tokens, return_dict_in_generate=True, **generate_kwargs) + t2 = time.perf_counter() output_tokens = output.sequences @@ -30,48 +115,36 @@ def __call__(self, text: List[str], **generate_kwargs) -> Tuple[List[str], List[ num_generated_tokens = [o - i for i, o in zip(input_token_lengths, output_token_lengths)] output_text = self.tokenizer.batch_decode(output_tokens, skip_special_tokens=True) + t3 = time.perf_counter() + + metrics = { + NUM_GENERATED_TOKENS: num_generated_tokens, + TOKENIZE_TIME: t1 - t0, + MODEL_TIME: t2 - t1, + DECODE_TIME: t3 - t2, + END_TO_END_TIME: t3 - t0, + } - return output_text, num_generated_tokens + return output_text, metrics def get_num_parameters(self) -> int: - param_count = 0 - for i in self.model.parameters(): - param_count += i.numel() - return param_count - - -def get_config_tokenizer_model_class(args: Namespace) -> Union[BloomConfig, GPT2Config]: - tokenizer = AutoTokenizer.from_pretrained("gpt2") - tokenizer.add_special_tokens({"pad_token": "[PAD]"}) - - if args.model_class.lower() == "bloom": - config = BloomConfig( - attention_softmax_in_fp32=True, - hidden_size=args.hidden_size, - n_head=args.n_head, - n_layer=args.n_layer, - vocab_size=len(tokenizer), - bos_token_id=tokenizer.bos_token_id, - eos_token_id=tokenizer.eos_token_id, - use_cache=True, - ) - model_class = BloomForCausalLM - elif args.model_class.lower() == "gpt2": - config = GPT2Config( - n_embd=args.hidden_size, - n_head=args.n_head, - n_layer=args.n_layer, - n_positions=args.n_positions, - bos_token_id=tokenizer.bos_token_id, - eos_token_id=tokenizer.eos_token_id, - attention_type=args.attention_type, - print_details=False, - vocab_size=len(tokenizer), - use_cache=True, - ) - model_class = GPT2LMHeadModel - - if not os.path.exists("tmp"): - model_class._from_config(config).save_pretrained("tmp") - - return config, tokenizer, model_class + return sum(p.numel() for p in self.model.parameters()) + + def aggregate_and_format_metrics(self, metrics: List[Dict[str, Any]]): + all_metrics = {key: [metrics_[key] for metrics_ in metrics if key in metrics_] for key in METRIC_KEYS} + mean_metrics = {key: np.mean(all_metrics[key]).item() for key in METRIC_KEYS if len(all_metrics[key]) > 0} + throughput = mean_metrics[NUM_GENERATED_TOKENS] / mean_metrics[END_TO_END_TIME] + model_throughput = mean_metrics[NUM_GENERATED_TOKENS] / mean_metrics[MODEL_TIME] + + return { + "Latency (end to end)": format_ms(mean_metrics[END_TO_END_TIME]), + "Latency (tokenization)": format_ms(mean_metrics[TOKENIZE_TIME]), + "Latency (model)": format_ms(mean_metrics[MODEL_TIME]), + "Latency (decode)": format_ms(mean_metrics[DECODE_TIME]), + "Latency (max)": format_ms(max(all_metrics[END_TO_END_TIME])), + "Latency (min)": format_ms(min(all_metrics[END_TO_END_TIME])), + "Tokens generated": f"{mean_metrics[NUM_GENERATED_TOKENS]:.0f}", + "Throughput (model)": f"{model_throughput:.2f} tokens/s", + "Throughput (end to end)": f"{throughput:.2f} tokens/s", + "Token time (end to end)": f"{format_ms(throughput ** -1)}/token", + } diff --git a/src/pipelines/transformers.py b/src/pipelines/transformers.py new file mode 100644 index 00000000000000..d94e7bc24987a4 --- /dev/null +++ b/src/pipelines/transformers.py @@ -0,0 +1,10 @@ +from argparse import Namespace + +from src.pipelines.pipeline import Pipeline +from src.utils.arguments import check_unused + + +class HF_Pipeline(Pipeline): + def __init__(self, args: Namespace) -> None: + check_unused(args, {"inject_kernel": False, "cuda_graph": False}) + super().__init__(args) diff --git a/src/utils/__init__.py b/src/utils/__init__.py index a94745694bf923..e69de29bb2d1d6 100644 --- a/src/utils/__init__.py +++ b/src/utils/__init__.py @@ -1,3 +0,0 @@ -from .arguments import get_arg_parser, get_args -from .benchmark import benchmark_end_to_end -from .dummy import get_dummy_batch diff --git a/src/utils/arguments.py b/src/utils/arguments.py index 158fbe3e3a0848..27c0a9ce7016cb 100644 --- a/src/utils/arguments.py +++ b/src/utils/arguments.py @@ -1,28 +1,76 @@ +import warnings from argparse import ArgumentParser, Namespace +from typing import Any, Dict import torch def get_arg_parser() -> ArgumentParser: parser = ArgumentParser() - parser.add_argument("--pipeline_class", default="HF_GPU_Pipeline", type=str) + + # Model parser.add_argument("--model_class", default="GPT2", type=str) - parser.add_argument("--batch_size", default=1, type=int) - parser.add_argument("--dtype", default="bfloat16", type=str) - parser.add_argument("--max_input_length", default=-1, type=int) - parser.add_argument("--max_new_tokens", default=100, type=int) - parser.add_argument("--local_rank", type=int) parser.add_argument("--hidden_size", type=int) parser.add_argument("--attention_type", type=int) parser.add_argument("--n_positions", type=int) parser.add_argument("--n_head", type=int) parser.add_argument("--n_layer", type=int) - parser.add_argument("--benchmark_cycles", type=int, default=5) + parser.add_argument("--activation_function", default="gelu_new_python") + + # Runtime + parser.add_argument("--pipeline_class", default="HF_Pipeline", type=str) + parser.add_argument("--device", default="cuda", type=torch.device) + parser.add_argument("--dtype", default="float16", type=lambda x: getattr(torch, x)) + parser.add_argument("--local_rank", type=int) + + # Input and output + parser.add_argument("--batch_size", default=1, type=int) + parser.add_argument("--max_input_length", default=-1, type=int) + parser.add_argument("--max_new_tokens", default=100, type=int) + + # Cleanup parser.add_argument("--clear_every_run", action="store_true") + + # Deepspeed + parser.add_argument("--no_inject_kernel", dest="inject_kernel", action="store_false") + parser.add_argument("--cuda_graph", action="store_true") + + # Benchmark cycles + parser.add_argument("--skip", type=int, default=1) + parser.add_argument("--warmup", type=int, default=None) + parser.add_argument("--cycles", type=int, default=5) + + # Profiling and logging + parser.add_argument("--max_log_outputs", default=None, type=int) + parser.add_argument("--profile", action="store_true") + parser.add_argument("--full_trace", action="store_true") + parser.add_argument("--show_op_names", action="store_true") + return parser -def get_args(parser: ArgumentParser) -> Namespace: - args = parser.parse_args() - args.dtype = getattr(torch, args.dtype) +def check_unused(args: Namespace, defaults: Dict[str, Any], enforce=False): + for name, default in defaults.items(): + val = getattr(args, name) + is_default = val is None if default is None else val == default + if not is_default: + warnings.warn( + f"{'Invalid' if enforce else 'Unexpected'} argument: --{name} (value =" + f" {val}, {'setting to' if enforce else 'expected'} {default})" + ) + if enforce: + setattr(args, name, default) + + +def parse_args(argv=None, parser: ArgumentParser = None) -> Namespace: + if parser is None: + parser = get_arg_parser() + args = parser.parse_args(argv) + + if args.warmup is None: + args.warmup = args.profile + + if args.max_log_outputs is None: + args.max_log_outputs = args.batch_size + return args diff --git a/src/utils/benchmark.py b/src/utils/benchmark.py index aa14961bdd2d59..146685af9365ff 100644 --- a/src/utils/benchmark.py +++ b/src/utils/benchmark.py @@ -1,76 +1,110 @@ +import contextlib import gc +import logging from argparse import Namespace from functools import partial -from typing import List +from typing import List, Type, Union import torch -from pipelines import Pipeline +from src.pipelines.pipeline import Pipeline +from src.utils.logging import format_ms, log_dict, log_rank_n +from src.utils.utils import run_and_log_time -from .utils import print_rank_n, run_and_log_time +logger = logging.getLogger(__name__) -def benchmark_generation(pipeline: Pipeline, text: List[str], generate_kwargs: dict, cycles: int = 5) -> int: - # run benchmarks for number of cycles - total_new_tokens_generated = 0 - for _ in range(cycles): - _, num_generated_tokens = pipeline(text, **generate_kwargs) - total_new_tokens_generated += sum(new_tokens for new_tokens in num_generated_tokens) - return total_new_tokens_generated +def get_trace_fn(args, rank=-1): + def trace_fn( + p: torch.profiler.profile, + ): + averages = p.key_averages() + if args.full_trace: + # Show every GPU op. + # Exclude CPU cuda ops to shorten the table. + events = torch.autograd.profiler.EventList( + [evt for evt in p.profiler.function_events if evt.self_cuda_time_total > 0] + ) + log_rank_n(events.table(row_limit=-1, max_src_column_width=1000), logger.info, rank) -def get_benchmark_results( - benchmark_time: float, initialization_time: float, total_new_tokens_generated: int, batch_size: int, cycles: int -) -> str: - throughput = total_new_tokens_generated / benchmark_time - latency = benchmark_time / cycles - return f""" -*** Performance stats: -Throughput (including tokenization) = {throughput:.2f} tokens/sec -Throughput (including tokenization) = {1000 / throughput:.2f} msecs/token -Model loading time = {initialization_time:.2f} secs -Total tokens generated = {total_new_tokens_generated} with batch size = {batch_size} -Latency = {latency:.2f} secs -Model loading time + generation time per batch = {initialization_time + latency:.2f} secs -""" + if args.show_op_names: + # Show non-cropped names, in the same order as in the table. + averages_sorted = torch.autograd.profiler.EventList( + sorted(averages, key=lambda evt: evt.self_cuda_time_total, reverse=True) + ) + for entry in averages_sorted: + log_rank_n(entry.key, logger.info, rank) + # Try to avoid name cropping, still hard-coded to max 55 characters + log_rank_n( + averages.table(sort_by="self_cuda_time_total", row_limit=-1, max_src_column_width=1000), logger.info, rank + ) -def benchmark_end_to_end(args: Namespace, pipeline_class: Pipeline, text: List[str], generate_kwargs: dict) -> None: + return trace_fn + + +def get_profiler(args: Namespace) -> Union[torch.profiler.profile, contextlib.nullcontext]: + schedule = torch.profiler.schedule( + # Warmup is a must if measuring speed as it's when all the optimizations are performed + # e.g. on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs + skip_first=args.skip, + # Warmup for the profiler + warmup=args.warmup, + wait=0, + active=args.cycles, + ) + return torch.profiler.profile( + schedule=schedule, + activities=[torch.profiler.ProfilerActivity.CUDA], + on_trace_ready=get_trace_fn(args), + ) + + +def benchmark_end_to_end( + args: Namespace, + pipeline_class: Type[Pipeline], + text: List[str], + generate_kwargs: dict, +) -> None: + pipeline: Pipeline pipeline, initialization_time = run_and_log_time(partial(pipeline_class, args=args)) - print_rank_n("num params =", pipeline.get_num_parameters()) - - print_rank_n(f"generate_kwargs = {generate_kwargs}") - print_rank_n(f"batch_size = {args.batch_size}") - - # warmup is a must if measuring speed as it's when all the optimizations are performed - # e.g. on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs - generated_text, _ = pipeline(text, **generate_kwargs) - - for i, o in zip(text, generated_text): - print_rank_n(f"{'-' * 60}\nINPUT = {i}\nOUTPUT = {o}\n") - - if args.benchmark_cycles > 0: - print_rank_n(f"*** Running benchmark") - - if args.clear_every_run: - torch.cuda.empty_cache() - gc.collect() - torch.cuda.synchronize() - - # benchmark - total_new_tokens_generated, benchmark_time = run_and_log_time( - partial( - benchmark_generation, - pipeline=pipeline, - text=text, - generate_kwargs=generate_kwargs, - cycles=args.benchmark_cycles, - ) - ) - - print_rank_n( - get_benchmark_results( - benchmark_time, initialization_time, total_new_tokens_generated, args.batch_size, args.benchmark_cycles - ) - ) + warmup = args.warmup + if warmup is None: + warmup = args.profile + + all_metrics = [] + + with (get_profiler(args) if args.profile else contextlib.nullcontext()) as p: + for step in range(args.skip + warmup + args.cycles): + generated_text, metrics = pipeline(text, **generate_kwargs) + if args.profile: + p.step() + + if step == 0: + for i, o, _ in zip(text, generated_text, range(args.max_log_outputs)): + log_rank_n(f"{'-' * 60}\nINPUT = {i}\nOUTPUT = {o}", logger.info) + + if step >= args.skip + warmup: + all_metrics.append(metrics) + + if args.clear_every_run: + torch.cuda.synchronize() + gc.collect() + torch.cuda.empty_cache() + + if len(all_metrics) > 0: + log_rank_n("*** Performance metrics:", logger.info) + log_dict(pipeline.aggregate_and_format_metrics(all_metrics), logger.info) + + log_rank_n("*** Benchmarking stats:", logger.info) + log_dict( + { + "Model initialization time": format_ms(initialization_time), + "Model parameters": pipeline.get_num_parameters(), + "Batch size": args.batch_size, + **generate_kwargs, + }, + logger.info, + ) diff --git a/src/utils/fast_init.py b/src/utils/fast_init.py new file mode 100644 index 00000000000000..dec45ad6102f45 --- /dev/null +++ b/src/utils/fast_init.py @@ -0,0 +1,52 @@ +import contextlib +from typing import Callable, Dict, Type + +import torch + +from transformers import Conv1D +from transformers.modeling_utils import no_init_weights + + +def _conv1d_init(self, nf, nx, device=None): + super(Conv1D, self).__init__() + self.nf = nf + w = torch.empty(nx, nf, device=device) + torch.nn.init.normal_(w, std=0.02) + self.weight = torch.nn.Parameter(w) + b = torch.empty(nf, device=device) + torch.nn.init.zeros_(b) + self.bias = torch.nn.Parameter(b) + + +_ORIGINAL_INITS: Dict[Type[torch.nn.Module], Callable] = { + Conv1D: _conv1d_init, + torch.nn.Linear: torch.nn.Linear.__init__, + torch.nn.Embedding: torch.nn.Embedding.__init__, + torch.nn.LayerNorm: torch.nn.LayerNorm.__init__, +} + + +def _get_fast_init(cls: Type[torch.nn.Module], device: torch.device): + assert cls in _ORIGINAL_INITS + + def _fast_init(self, *args, **kwargs): + # Same as torch.nn.utils.skip_init, excluding checks + _ORIGINAL_INITS[cls](self, *args, **kwargs, device="meta") + self.to_empty(device=device) + + return _fast_init + + +@contextlib.contextmanager +def fast_init(device: torch.device, init_weights: bool = False): + """ + Avoid multiple slow initializations on cpu. + """ + for cls in _ORIGINAL_INITS: + cls.__init__ = _get_fast_init(cls, device) + + with contextlib.nullcontext() if init_weights else no_init_weights(): + yield + + for cls in _ORIGINAL_INITS: + cls.__init__ = _ORIGINAL_INITS[cls] diff --git a/src/utils/dummy.py b/src/utils/input.py similarity index 100% rename from src/utils/dummy.py rename to src/utils/input.py diff --git a/src/utils/logging.py b/src/utils/logging.py new file mode 100644 index 00000000000000..4ec8a39f5d5759 --- /dev/null +++ b/src/utils/logging.py @@ -0,0 +1,45 @@ +import logging +import logging.config +from typing import Callable + +from torch import distributed as dist + + +def configure_logging(name=None): + logging_config = { + "version": 1, + "disable_existing_loggers": False, + "formatters": { + "default": { + "format": f"%(asctime)s{'' if name is None else ' ['+name+']'}: %(message)s", + "use_colors": True, + } + }, + "handlers": { + "default": { + "level": "INFO", + "formatter": "default", + "class": "logging.StreamHandler", + "stream": "ext://sys.stdout", + } + }, + "loggers": {"default": {"level": "DEBUG", "handlers": ["default"]}}, + "root": {"handlers": ["default"], "level": "INFO"}, + } + logging.config.dictConfig(logging_config) + + +def log_rank_n(msg: str, logger: Callable = logging.info, rank: int = 0): + if rank < 0 or not dist.is_initialized() or dist.get_rank() == rank: + # Multi-line logs break formatting + for line in msg.splitlines(): + logger(line) + + +def log_dict(data: dict, logger: Callable = logging.info, rank: int = 0): + for key, value in data.items(): + log_rank_n(f"{key}: {value}", logger, rank) + + +def format_ms(t: float): + return f"{1000 * t:.2f} ms" diff --git a/src/utils/utils.py b/src/utils/utils.py index 2ae7b7f7dffc28..d678fe33389d5c 100644 --- a/src/utils/utils.py +++ b/src/utils/utils.py @@ -2,12 +2,10 @@ from functools import partial from typing import Any, List, Tuple, Union -import torch.distributed as dist - def run_and_log_time(execs: Union[List[partial], partial]) -> Tuple[Union[List[Any], Any], float]: # runs a function / list of functions and times them - start_time = time.time() + start_time = time.perf_counter() if type(execs) == list: results = [] @@ -16,30 +14,5 @@ def run_and_log_time(execs: Union[List[partial], partial]) -> Tuple[Union[List[A else: results = execs() - time_elapsed = time.time() - start_time + time_elapsed = time.perf_counter() - start_time return results, time_elapsed - - -def run_rank_n(func: partial, barrier: bool = False, rank: int = 0, other_rank_output: Any = None) -> Any: - # runs function on only process with specified rank - if dist.is_initialized(): - if dist.get_rank() == rank: - output = func() - if barrier: - dist.barrier() - return output - else: - if barrier: - dist.barrier() - return other_rank_output - else: - return func() - - -def print_rank_n(*values, rank: int = 0) -> None: - # print on only process with specified rank - if dist.is_initialized(): - if dist.get_rank() == rank: - print(*values) - else: - print(*values) diff --git a/transformers b/transformers index b7e2124cf72623..98319da5defd14 160000 --- a/transformers +++ b/transformers @@ -1 +1 @@ -Subproject commit b7e2124cf726235ccaefe17ff960e9117f86949c +Subproject commit 98319da5defd1462c05f7eba1b9e215c704b274e