From 4be387d23b61ff1a6df2367aad3510f525535560 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier <joel.lamy-poirier@servicenow.com>
Date: Mon, 30 Jan 2023 18:39:23 -0500
Subject: [PATCH] Profiling and misc (#10)

---
 Dockerfile                       |   4 +-
 Makefile                         |  52 +++------
 requirements.txt                 |   2 +-
 src/__init__.py                  |   0
 src/main.py                      |  18 ++--
 src/pipelines/__init__.py        |  14 ++-
 src/pipelines/ds.py              |  25 +++++
 src/pipelines/ds_inference.py    |  39 -------
 src/pipelines/hf.py              |  32 ------
 src/pipelines/pipeline.py        | 175 ++++++++++++++++++++++---------
 src/pipelines/transformers.py    |  10 ++
 src/utils/__init__.py            |   3 -
 src/utils/arguments.py           |  68 ++++++++++--
 src/utils/benchmark.py           | 156 ++++++++++++++++-----------
 src/utils/fast_init.py           |  52 +++++++++
 src/utils/{dummy.py => input.py} |   0
 src/utils/logging.py             |  45 ++++++++
 src/utils/utils.py               |  31 +-----
 transformers                     |   2 +-
 19 files changed, 454 insertions(+), 274 deletions(-)
 create mode 100644 src/__init__.py
 create mode 100644 src/pipelines/ds.py
 delete mode 100644 src/pipelines/ds_inference.py
 delete mode 100644 src/pipelines/hf.py
 create mode 100644 src/pipelines/transformers.py
 create mode 100644 src/utils/fast_init.py
 rename src/utils/{dummy.py => input.py} (100%)
 create mode 100644 src/utils/logging.py

diff --git a/Dockerfile b/Dockerfile
index 2b24dc242a74b3..c8a786fa239ee1 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -11,6 +11,8 @@ RUN useradd -m -u $USER -s /bin/bash $USERNAME \
 
 COPY --chown=$USERNAME ./requirements.txt ./
 COPY --chown=$USERNAME transformers/ ./transformers
-RUN pip install -r requirements.txt
+
+# Stock version of pip doesn't work with editable transformers.
+RUN pip install --upgrade pip --no-cache-dir && pip install -r requirements.txt --no-cache-dir
 
 COPY --chown=$USERNAME src/ ./src
diff --git a/Makefile b/Makefile
index 3d73df02bd3f19..c147da5bb560d3 100644
--- a/Makefile
+++ b/Makefile
@@ -6,69 +6,49 @@ style:
 
 batch_size := 1
 
-install-mqa-transformers:
-	git clone https://github.com/bigcode-project/transformers.git; \
-	cd transformers; \
-	git checkout mayank/multi_query; \
-	pip install .; \
-	cd ..; \
-	rm -rf transformers;
+install:
+	git submodule update --init
+	pip install -r requirements.txt
 
 # BLOOM AliBi
 hf-1b-bloom-fp32:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class BLOOM --dtype float32 --batch_size ${batch_size}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class BLOOM --dtype float32 --batch_size ${batch_size}
 
 hf-1b-bloom-bf16:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class BLOOM --dtype bfloat16 --batch_size ${batch_size}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class BLOOM --dtype bfloat16 --batch_size ${batch_size}
 
 hf-1b-bloom-int8:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class BLOOM --dtype int8 --batch_size ${batch_size}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class BLOOM --dtype int8 --batch_size ${batch_size}
 
 ds-inference-1b-bloom-fp16:
-	deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class BLOOM --batch_size ${batch_size}
+	deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Pipeline --model_class BLOOM --batch_size ${batch_size}
 
 # GPT2 MHA
 hf-1b-GPT2-mha-fp32:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype float32 --batch_size ${batch_size}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype float32 --batch_size ${batch_size}
 
 hf-1b-GPT2-mha-bf16:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype bfloat16 --batch_size ${batch_size}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype bfloat16 --batch_size ${batch_size}
 
 hf-1b-GPT2-mha-int8:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype int8 --batch_size ${batch_size}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype int8 --batch_size ${batch_size}
 
 ds-inference-1b-GPT2-mha-fp16:
-	deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --batch_size ${batch_size}
-
-# GPT2 MQA
-hf-1b-GPT2-mqa-fp32:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype float32 --batch_size ${batch_size}
-
-hf-1b-GPT2-mqa-bf16:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype bfloat16 --batch_size ${batch_size}
-
-hf-1b-GPT2-mqa-int8:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype int8 --batch_size ${batch_size}
-
-ds-inference-1b-GPT2-mqa-fp16:
-	deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --batch_size ${batch_size}
+	deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --batch_size ${batch_size}
 
 # GPT2 MQA1
 hf-1b-GPT2-mqa1-fp32:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --dtype float32 --batch_size ${batch_size}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype float32 --batch_size ${batch_size}
 
 hf-1b-GPT2-mqa1-bf16:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --dtype bfloat16 --batch_size ${batch_size}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype bfloat16 --batch_size ${batch_size}
 
 hf-1b-GPT2-mqa1-int8:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --dtype int8 --batch_size ${batch_size}
-
-ds-inference-1b-GPT2-mqa1-fp16:
-	deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --batch_size ${batch_size}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype int8 --batch_size ${batch_size}
 
 # Input length experiments
 hf-1b-GPT2-mqa1-int8-input-length:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --dtype int8 --batch_size ${batch_size} --max_input_length ${max_input_length}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype int8 --batch_size ${batch_size} --max_input_length ${max_input_length}
 
 hf-1b-GPT2-mha-int8-input-length:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype int8 --batch_size ${batch_size} --max_input_length ${max_input_length}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype int8 --batch_size ${batch_size} --max_input_length ${max_input_length}
diff --git a/requirements.txt b/requirements.txt
index f6512732c78380..2b8ca551953a60 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 accelerate==0.15.0
 bitsandbytes
 deepspeed==0.7.7
-./transformers
+-e ./transformers
 
 # TODO: Dev only
 isort>=5.5.4
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/src/main.py b/src/main.py
index 30ec6a1ecdcca9..ec9ffc15b33dd1 100644
--- a/src/main.py
+++ b/src/main.py
@@ -1,19 +1,23 @@
-import pipelines
-from utils import benchmark_end_to_end, get_arg_parser, get_args, get_dummy_batch
+from typing import List, Optional
 
+from src.pipelines import get_pipeline_class
+from src.utils.arguments import parse_args
+from src.utils.benchmark import benchmark_end_to_end
+from src.utils.input import get_dummy_batch
+from src.utils.logging import configure_logging
 
-def main() -> None:
-    # deepspeed.init_distributed("nccl")
 
-    args = get_args(get_arg_parser())
+def main(argv: Optional[List[str]] = None) -> None:
+    args = parse_args(argv=argv)
 
     inputs = get_dummy_batch(args.batch_size, args.max_input_length)
 
-    generate_kwargs = dict(max_new_tokens=args.max_new_tokens, do_sample=False)
+    generate_kwargs = {"max_new_tokens": args.max_new_tokens, "do_sample": False}
 
-    pipeline_class = getattr(pipelines, args.pipeline_class)
+    pipeline_class = get_pipeline_class(args.pipeline_class)
     benchmark_end_to_end(args, pipeline_class, inputs, generate_kwargs)
 
 
 if __name__ == "__main__":
+    configure_logging()
     main()
diff --git a/src/pipelines/__init__.py b/src/pipelines/__init__.py
index 2581940f705621..52027b5c0853ef 100644
--- a/src/pipelines/__init__.py
+++ b/src/pipelines/__init__.py
@@ -1,3 +1,11 @@
-from .ds_inference import DS_Inference_Pipeline
-from .hf import HF_CPU_Pipeline, HF_GPU_Pipeline
-from .pipeline import Pipeline
+def get_pipeline_class(name):
+    """Return the pipeline class registered under `name` ("HF_Pipeline" or "DS_Pipeline")."""
+    # Imports are deferred so that only the selected backend needs to be importable.
+    if name == "HF_Pipeline":
+        from src.pipelines.transformers import HF_Pipeline as pipeline_class
+    elif name == "DS_Pipeline":
+        from src.pipelines.ds import DS_Pipeline as pipeline_class
+    else:
+        raise NotImplementedError(f"Unsupported pipeline class: {name}")
+
+    return pipeline_class
diff --git a/src/pipelines/ds.py b/src/pipelines/ds.py
new file mode 100644
index 00000000000000..047cfd676d4a25
--- /dev/null
+++ b/src/pipelines/ds.py
@@ -0,0 +1,25 @@
+import os
+from argparse import Namespace
+
+import deepspeed
+import torch
+
+from src.pipelines.pipeline import Pipeline
+from src.utils.arguments import check_unused
+
+
+class DS_Pipeline(Pipeline):
+    def __init__(self, args: Namespace) -> None:
+        # Any device other than plain `cuda` is overridden (with a warning) before the model is built.
+        check_unused(args, {"device": torch.device("cuda")}, enforce=True)
+        # TODO: Works with other dtypes?
+        check_unused(args, {"dtype": torch.float16})
+        super().__init__(args)
+        # Wrap the model created by the base class; mp_size follows WORLD_SIZE (1 when unset).
+        self.model = deepspeed.init_inference(
+            self.model,
+            mp_size=int(os.getenv("WORLD_SIZE", "1")),
+            dtype=args.dtype,
+            replace_with_kernel_inject=args.inject_kernel,
+            enable_cuda_graph=args.cuda_graph,
+        )
diff --git a/src/pipelines/ds_inference.py b/src/pipelines/ds_inference.py
deleted file mode 100644
index 96a27d5e46d476..00000000000000
--- a/src/pipelines/ds_inference.py
+++ /dev/null
@@ -1,39 +0,0 @@
-import os
-from argparse import Namespace
-
-import deepspeed
-import torch
-
-from .pipeline import Pipeline
-
-
-class DS_Inference_Pipeline(Pipeline):
-    def __init__(self, args: Namespace) -> None:
-        super().__init__(args)
-
-        world_size = int(os.getenv("WORLD_SIZE", "1"))
-
-        # with deepspeed.OnDevice(dtype=torch.bfloat16, device="meta"):
-        #     model = BloomForCausalLM._from_config(config, torch_dtype=torch.bfloat16)
-        self.model = self.model_class.from_pretrained("tmp", torch_dtype=torch.bfloat16)
-        self.model.eval()
-
-        # checkpoints_json = os.path.join(args.model_name, "checkpoints.json")
-
-        # if dist.get_rank() == 0:
-        #     with io.open(checkpoints_json, "w", encoding="utf-8") as f:
-        #         checkpoint_files = [str(entry) for entry in Path(args.model_name).rglob("*.[bp][it][n]") if entry.is_file()]
-        #         data = {"type": "BLOOM", "checkpoints": checkpoint_files, "version": 1.0}
-        #         json.dump(data, f)
-        # dist.barrier()
-
-        self.model = deepspeed.init_inference(
-            self.model,
-            mp_size=world_size,
-            # base_dir="./",
-            dtype=torch.float16,
-            replace_with_kernel_inject=True
-            # checkpoint=checkpoints_json,
-        )
-
-        self.input_device = torch.cuda.current_device()
diff --git a/src/pipelines/hf.py b/src/pipelines/hf.py
deleted file mode 100644
index 105007449ddf46..00000000000000
--- a/src/pipelines/hf.py
+++ /dev/null
@@ -1,32 +0,0 @@
-from argparse import Namespace
-
-import torch
-
-from .pipeline import Pipeline
-
-
-class HF_Pipeline(Pipeline):
-    def __init__(self, args: Namespace, device: str = "cpu") -> None:
-        super().__init__(args)
-
-        model_kwargs = {}
-
-        if args.dtype == torch.int8:
-            model_kwargs["load_in_8bit"] = True
-            model_kwargs["device_map"] = "auto"
-        else:
-            model_kwargs["torch_dtype"] = args.dtype
-
-        self.input_device = device
-        self.model = self.model_class.from_pretrained("tmp", **model_kwargs).to(self.input_device)
-        self.model.eval()
-
-
-class HF_CPU_Pipeline(HF_Pipeline):
-    def __init__(self, args: Namespace) -> None:
-        super().__init__(args, "cpu")
-
-
-class HF_GPU_Pipeline(HF_Pipeline):
-    def __init__(self, args: Namespace) -> None:
-        super().__init__(args, "cuda:0")
diff --git a/src/pipelines/pipeline.py b/src/pipelines/pipeline.py
index fdbf40c8df539d..939992175cfaac 100644
--- a/src/pipelines/pipeline.py
+++ b/src/pipelines/pipeline.py
@@ -1,27 +1,112 @@
-import os
+import gc
+import logging
+import time
 from argparse import Namespace
-from typing import List, Tuple, Union
+from typing import Any, Dict, List, Tuple, Type
 
+import numpy as np
 import torch
 
-from transformers import AutoTokenizer, BloomConfig, BloomForCausalLM, GPT2Config, GPT2LMHeadModel
+from src.utils.arguments import check_unused
+from src.utils.fast_init import fast_init
+from src.utils.logging import format_ms, log_rank_n
+from transformers import AutoTokenizer, BloomForCausalLM, GPT2LMHeadModel, PretrainedConfig, PreTrainedModel
+
+
+logger = logging.getLogger(__name__)
+
+NUM_GENERATED_TOKENS = "num_generated_tokens"
+TOKENIZE_TIME = "tokenize_time"
+MODEL_TIME = "model_time"
+DECODE_TIME = "decode_time"
+END_TO_END_TIME = "end_to_end_time"
+
+METRIC_KEYS = (
+    NUM_GENERATED_TOKENS,
+    TOKENIZE_TIME,
+    MODEL_TIME,
+    DECODE_TIME,
+    END_TO_END_TIME,
+)
 
 
 class Pipeline:
     def __init__(self, args: Namespace) -> None:
-        self.config, self.tokenizer, self.model_class = get_config_tokenizer_model_class(args)
-        self.model = None
-        self.input_device = None
-
-    def __call__(self, text: List[str], **generate_kwargs) -> Tuple[List[str], List[int]]:
+        log_rank_n("*** Setting up tokenizer", logger.info)
+        self.tokenizer = AutoTokenizer.from_pretrained("gpt2")
+        self.tokenizer.add_special_tokens({"pad_token": "[PAD]"})
+
+        self.device = args.device
+
+        model_class, config = self.get_config(args)
+        is_int8 = args.dtype == torch.int8
+        if is_int8:
+            check_unused(args, {"device": torch.device("cuda")}, enforce=True)
+        torch_dtype = torch.float16 if is_int8 else args.dtype
+
+        log_rank_n("*** Creating model", logger.info)
+        with fast_init(self.device):
+            self.model = model_class._from_config(config=config, torch_dtype=torch_dtype)
+        log_rank_n("*** Moving to device", logger.info)
+        self.model.to(self.device)
+        log_rank_n("*** Initializing weights", logger.info)
+        # Initialization is ~1000x faster on GPU.
+        self.model.init_weights()
+
+        # Int8 can only be obtained by reloading a pretrained model
+        if is_int8:
+            log_rank_n("*** Saving model", logger.info)
+            self.model.save_pretrained("tmp")
+            self.model = None
+            gc.collect()
+            torch.cuda.empty_cache()
+            log_rank_n("*** Reloading model in int8", logger.info)
+            with fast_init(self.device):
+                self.model = model_class.from_pretrained(
+                    "tmp",
+                    load_in_8bit=True,
+                    device_map="auto",
+                )
+
+        self.model.eval()
+
+    def get_config(self, args) -> Tuple[Type[PreTrainedModel], PretrainedConfig]:
+        """Return the (model class, config) pair for `args.model_class` ("bloom" or "gpt2", case-insensitive)."""
+        config_args = {
+            "activation_function": args.activation_function,
+            "n_head": args.n_head,
+            "n_layer": args.n_layer,
+            "bos_token_id": self.tokenizer.bos_token_id,
+            "eos_token_id": self.tokenizer.eos_token_id,
+            "vocab_size": len(self.tokenizer),
+            "use_cache": True,
+        }
+        if args.model_class.lower() == "bloom":
+            check_unused(args, {"attention_type": 1, "n_positions": None})
+            config_args["attention_softmax_in_fp32"] = True
+            config_args["hidden_size"] = args.hidden_size
+            model_class = BloomForCausalLM
+        elif args.model_class.lower() == "gpt2":
+            config_args["attention_type"] = args.attention_type
+            config_args["n_embd"] = args.hidden_size
+            config_args["n_positions"] = args.n_positions
+            model_class = GPT2LMHeadModel
+        else:
+            raise NotImplementedError(f"Unsupported model class: {args.model_class}")
+        return model_class, model_class.config_class(**config_args)
+
+    def __call__(self, text: List[str], **generate_kwargs) -> Tuple[List[str], Dict[str, Any]]:
+        t0 = time.perf_counter()
         input_tokens = self.tokenizer(text, return_tensors="pt", padding=True)
 
         for t in input_tokens:
             if torch.is_tensor(input_tokens[t]):
-                input_tokens[t] = input_tokens[t].to(self.input_device)
+                input_tokens[t] = input_tokens[t].to(self.device)
 
+        t1 = time.perf_counter()
         with torch.no_grad():
             output = self.model.generate(**input_tokens, return_dict_in_generate=True, **generate_kwargs)
+        t2 = time.perf_counter()
 
         output_tokens = output.sequences
 
@@ -30,48 +115,36 @@ def __call__(self, text: List[str], **generate_kwargs) -> Tuple[List[str], List[
         num_generated_tokens = [o - i for i, o in zip(input_token_lengths, output_token_lengths)]
 
         output_text = self.tokenizer.batch_decode(output_tokens, skip_special_tokens=True)
+        t3 = time.perf_counter()
+
+        metrics = {
+            NUM_GENERATED_TOKENS: num_generated_tokens,
+            TOKENIZE_TIME: t1 - t0,
+            MODEL_TIME: t2 - t1,
+            DECODE_TIME: t3 - t2,
+            END_TO_END_TIME: t3 - t0,
+        }
 
-        return output_text, num_generated_tokens
+        return output_text, metrics
 
     def get_num_parameters(self) -> int:
-        param_count = 0
-        for i in self.model.parameters():
-            param_count += i.numel()
-        return param_count
-
-
-def get_config_tokenizer_model_class(args: Namespace) -> Union[BloomConfig, GPT2Config]:
-    tokenizer = AutoTokenizer.from_pretrained("gpt2")
-    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
-
-    if args.model_class.lower() == "bloom":
-        config = BloomConfig(
-            attention_softmax_in_fp32=True,
-            hidden_size=args.hidden_size,
-            n_head=args.n_head,
-            n_layer=args.n_layer,
-            vocab_size=len(tokenizer),
-            bos_token_id=tokenizer.bos_token_id,
-            eos_token_id=tokenizer.eos_token_id,
-            use_cache=True,
-        )
-        model_class = BloomForCausalLM
-    elif args.model_class.lower() == "gpt2":
-        config = GPT2Config(
-            n_embd=args.hidden_size,
-            n_head=args.n_head,
-            n_layer=args.n_layer,
-            n_positions=args.n_positions,
-            bos_token_id=tokenizer.bos_token_id,
-            eos_token_id=tokenizer.eos_token_id,
-            attention_type=args.attention_type,
-            print_details=False,
-            vocab_size=len(tokenizer),
-            use_cache=True,
-        )
-        model_class = GPT2LMHeadModel
-
-    if not os.path.exists("tmp"):
-        model_class._from_config(config).save_pretrained("tmp")
-
-    return config, tokenizer, model_class
+        return sum(p.numel() for p in self.model.parameters())
+
+    def aggregate_and_format_metrics(self, metrics: List[Dict[str, Any]]):
+        """Average the metrics collected over several calls and format them as human-readable strings."""
+        collected = {key: [run[key] for run in metrics if key in run] for key in METRIC_KEYS}
+        averaged = {key: np.mean(values).item() for key, values in collected.items() if len(values) > 0}
+        tokens, latencies = averaged[NUM_GENERATED_TOKENS], collected[END_TO_END_TIME]
+        throughput, model_throughput = tokens / averaged[END_TO_END_TIME], tokens / averaged[MODEL_TIME]
+        return {
+            "Latency (end to end)": format_ms(averaged[END_TO_END_TIME]),
+            "Latency (tokenization)": format_ms(averaged[TOKENIZE_TIME]),
+            "Latency (model)": format_ms(averaged[MODEL_TIME]),
+            "Latency (decode)": format_ms(averaged[DECODE_TIME]),
+            "Latency (max)": format_ms(max(latencies)),
+            "Latency (min)": format_ms(min(latencies)),
+            "Tokens generated": f"{tokens:.0f}",
+            "Throughput (model)": f"{model_throughput:.2f} tokens/s",
+            "Throughput (end to end)": f"{throughput:.2f} tokens/s",
+            "Token time (end to end)": f"{format_ms(throughput ** -1)}/token",
+        }
diff --git a/src/pipelines/transformers.py b/src/pipelines/transformers.py
new file mode 100644
index 00000000000000..d94e7bc24987a4
--- /dev/null
+++ b/src/pipelines/transformers.py
@@ -0,0 +1,10 @@
+from argparse import Namespace
+
+from src.pipelines.pipeline import Pipeline
+from src.utils.arguments import check_unused
+
+
+class HF_Pipeline(Pipeline):
+    def __init__(self, args: Namespace) -> None:
+        check_unused(args, {"inject_kernel": False, "cuda_graph": False})
+        super().__init__(args)
diff --git a/src/utils/__init__.py b/src/utils/__init__.py
index a94745694bf923..e69de29bb2d1d6 100644
--- a/src/utils/__init__.py
+++ b/src/utils/__init__.py
@@ -1,3 +0,0 @@
-from .arguments import get_arg_parser, get_args
-from .benchmark import benchmark_end_to_end
-from .dummy import get_dummy_batch
diff --git a/src/utils/arguments.py b/src/utils/arguments.py
index 158fbe3e3a0848..27c0a9ce7016cb 100644
--- a/src/utils/arguments.py
+++ b/src/utils/arguments.py
@@ -1,28 +1,76 @@
+import warnings
 from argparse import ArgumentParser, Namespace
+from typing import Any, Dict
 
 import torch
 
 
 def get_arg_parser() -> ArgumentParser:
     parser = ArgumentParser()
-    parser.add_argument("--pipeline_class", default="HF_GPU_Pipeline", type=str)
+
+    # Model
     parser.add_argument("--model_class", default="GPT2", type=str)
-    parser.add_argument("--batch_size", default=1, type=int)
-    parser.add_argument("--dtype", default="bfloat16", type=str)
-    parser.add_argument("--max_input_length", default=-1, type=int)
-    parser.add_argument("--max_new_tokens", default=100, type=int)
-    parser.add_argument("--local_rank", type=int)
     parser.add_argument("--hidden_size", type=int)
     parser.add_argument("--attention_type", type=int)
     parser.add_argument("--n_positions", type=int)
     parser.add_argument("--n_head", type=int)
     parser.add_argument("--n_layer", type=int)
-    parser.add_argument("--benchmark_cycles", type=int, default=5)
+    parser.add_argument("--activation_function", default="gelu_new_python")
+
+    # Runtime
+    parser.add_argument("--pipeline_class", default="HF_Pipeline", type=str)
+    parser.add_argument("--device", default="cuda", type=torch.device)
+    parser.add_argument("--dtype", default="float16", type=lambda x: getattr(torch, x))
+    parser.add_argument("--local_rank", type=int)
+
+    # Input and output
+    parser.add_argument("--batch_size", default=1, type=int)
+    parser.add_argument("--max_input_length", default=-1, type=int)
+    parser.add_argument("--max_new_tokens", default=100, type=int)
+
+    # Cleanup
     parser.add_argument("--clear_every_run", action="store_true")
+
+    # Deepspeed
+    parser.add_argument("--no_inject_kernel", dest="inject_kernel", action="store_false")
+    parser.add_argument("--cuda_graph", action="store_true")
+
+    # Benchmark cycles
+    parser.add_argument("--skip", type=int, default=1)
+    parser.add_argument("--warmup", type=int, default=None)
+    parser.add_argument("--cycles", type=int, default=5)
+
+    # Profiling and logging
+    parser.add_argument("--max_log_outputs", default=None, type=int)
+    parser.add_argument("--profile", action="store_true")
+    parser.add_argument("--full_trace", action="store_true")
+    parser.add_argument("--show_op_names", action="store_true")
+
     return parser
 
 
-def get_args(parser: ArgumentParser) -> Namespace:
-    args = parser.parse_args()
-    args.dtype = getattr(torch, args.dtype)
+def check_unused(args: Namespace, defaults: Dict[str, Any], enforce=False):
+    """Warn about arguments that differ from `defaults`; with `enforce`, also reset them to the default."""
+    kind, action = ("Invalid", "setting to") if enforce else ("Unexpected", "expected")
+    for name, default in defaults.items():
+        val = getattr(args, name)
+        # A None default only matches None itself; anything else is compared by equality.
+        if (val is None) if default is None else (val == default):
+            continue
+        warnings.warn(f"{kind} argument: --{name} (value = {val}, {action} {default})")
+        if enforce:
+            setattr(args, name, default)
+
+
+def parse_args(argv=None, parser: ArgumentParser = None) -> Namespace:
+    if parser is None:
+        parser = get_arg_parser()
+    args = parser.parse_args(argv)
+
+    if args.warmup is None:
+        args.warmup = args.profile
+
+    if args.max_log_outputs is None:
+        args.max_log_outputs = args.batch_size
+
     return args
diff --git a/src/utils/benchmark.py b/src/utils/benchmark.py
index aa14961bdd2d59..146685af9365ff 100644
--- a/src/utils/benchmark.py
+++ b/src/utils/benchmark.py
@@ -1,76 +1,110 @@
+import contextlib
 import gc
+import logging
 from argparse import Namespace
 from functools import partial
-from typing import List
+from typing import List, Type, Union
 
 import torch
 
-from pipelines import Pipeline
+from src.pipelines.pipeline import Pipeline
+from src.utils.logging import format_ms, log_dict, log_rank_n
+from src.utils.utils import run_and_log_time
 
-from .utils import print_rank_n, run_and_log_time
 
+logger = logging.getLogger(__name__)
 
-def benchmark_generation(pipeline: Pipeline, text: List[str], generate_kwargs: dict, cycles: int = 5) -> int:
-    # run benchmarks for number of cycles
-    total_new_tokens_generated = 0
-    for _ in range(cycles):
-        _, num_generated_tokens = pipeline(text, **generate_kwargs)
-        total_new_tokens_generated += sum(new_tokens for new_tokens in num_generated_tokens)
-    return total_new_tokens_generated
 
+def get_trace_fn(args, rank=-1):
+    def trace_fn(
+        p: torch.profiler.profile,
+    ):
+        averages = p.key_averages()
+        if args.full_trace:
+            # Show every GPU op.
+            # Exclude CPU cuda ops to shorten the table.
+            events = torch.autograd.profiler.EventList(
+                [evt for evt in p.profiler.function_events if evt.self_cuda_time_total > 0]
+            )
+            log_rank_n(events.table(row_limit=-1, max_src_column_width=1000), logger.info, rank)
 
-def get_benchmark_results(
-    benchmark_time: float, initialization_time: float, total_new_tokens_generated: int, batch_size: int, cycles: int
-) -> str:
-    throughput = total_new_tokens_generated / benchmark_time
-    latency = benchmark_time / cycles
-    return f"""
-*** Performance stats:
-Throughput (including tokenization) = {throughput:.2f} tokens/sec
-Throughput (including tokenization) = {1000 / throughput:.2f} msecs/token
-Model loading time = {initialization_time:.2f} secs
-Total tokens generated = {total_new_tokens_generated} with batch size = {batch_size}
-Latency = {latency:.2f} secs
-Model loading time + generation time per batch = {initialization_time + latency:.2f} secs
-"""
+        if args.show_op_names:
+            # Show non-cropped names, in the same order as in the table.
+            averages_sorted = torch.autograd.profiler.EventList(
+                sorted(averages, key=lambda evt: evt.self_cuda_time_total, reverse=True)
+            )
+            for entry in averages_sorted:
+                log_rank_n(entry.key, logger.info, rank)
 
+        # Try to avoid name cropping, still hard-coded to max 55 characters
+        log_rank_n(
+            averages.table(sort_by="self_cuda_time_total", row_limit=-1, max_src_column_width=1000), logger.info, rank
+        )
 
-def benchmark_end_to_end(args: Namespace, pipeline_class: Pipeline, text: List[str], generate_kwargs: dict) -> None:
+    return trace_fn
+
+
+def get_profiler(args: Namespace) -> Union[torch.profiler.profile, contextlib.nullcontext]:
+    """Build a profiler recording CUDA activity for `args.cycles` steps, reporting through `get_trace_fn`."""
+    schedule = torch.profiler.schedule(
+        # The first pass is when all the optimizations are performed (e.g. 8x80 a100, 100 tokens: 23sec, then 4secs)
+        skip_first=args.skip,
+        # Warmup for the profiler
+        warmup=args.warmup,
+        wait=0,
+        active=args.cycles,
+    )
+    return torch.profiler.profile(
+        schedule=schedule,
+        activities=[torch.profiler.ProfilerActivity.CUDA],
+        on_trace_ready=get_trace_fn(args),
+    )
+
+
+def benchmark_end_to_end(
+    args: Namespace,
+    pipeline_class: Type[Pipeline],
+    text: List[str],
+    generate_kwargs: dict,
+) -> None:
+    pipeline: Pipeline
     pipeline, initialization_time = run_and_log_time(partial(pipeline_class, args=args))
 
-    print_rank_n("num params =", pipeline.get_num_parameters())
-
-    print_rank_n(f"generate_kwargs = {generate_kwargs}")
-    print_rank_n(f"batch_size = {args.batch_size}")
-
-    # warmup is a must if measuring speed as it's when all the optimizations are performed
-    # e.g. on 8x80 a100 the first pass of 100 tokens takes 23sec, and the next one is 4secs
-    generated_text, _ = pipeline(text, **generate_kwargs)
-
-    for i, o in zip(text, generated_text):
-        print_rank_n(f"{'-' * 60}\nINPUT = {i}\nOUTPUT = {o}\n")
-
-    if args.benchmark_cycles > 0:
-        print_rank_n(f"*** Running benchmark")
-
-        if args.clear_every_run:
-            torch.cuda.empty_cache()
-            gc.collect()
-            torch.cuda.synchronize()
-
-        # benchmark
-        total_new_tokens_generated, benchmark_time = run_and_log_time(
-            partial(
-                benchmark_generation,
-                pipeline=pipeline,
-                text=text,
-                generate_kwargs=generate_kwargs,
-                cycles=args.benchmark_cycles,
-            )
-        )
-
-        print_rank_n(
-            get_benchmark_results(
-                benchmark_time, initialization_time, total_new_tokens_generated, args.batch_size, args.benchmark_cycles
-            )
-        )
+    warmup = args.warmup
+    if warmup is None:
+        warmup = args.profile
+
+    all_metrics = []
+
+    with (get_profiler(args) if args.profile else contextlib.nullcontext()) as p:
+        for step in range(args.skip + warmup + args.cycles):
+            generated_text, metrics = pipeline(text, **generate_kwargs)
+            if args.profile:
+                p.step()
+
+            if step == 0:
+                for i, o, _ in zip(text, generated_text, range(args.max_log_outputs)):
+                    log_rank_n(f"{'-' * 60}\nINPUT = {i}\nOUTPUT = {o}", logger.info)
+
+            if step >= args.skip + warmup:
+                all_metrics.append(metrics)
+
+            if args.clear_every_run:
+                torch.cuda.synchronize()
+                gc.collect()
+                torch.cuda.empty_cache()
+
+    if len(all_metrics) > 0:
+        log_rank_n("*** Performance metrics:", logger.info)
+        log_dict(pipeline.aggregate_and_format_metrics(all_metrics), logger.info)
+
+    log_rank_n("*** Benchmarking stats:", logger.info)
+    log_dict(
+        {
+            "Model initialization time": format_ms(initialization_time),
+            "Model parameters": pipeline.get_num_parameters(),
+            "Batch size": args.batch_size,
+            **generate_kwargs,
+        },
+        logger.info,
+    )
diff --git a/src/utils/fast_init.py b/src/utils/fast_init.py
new file mode 100644
index 00000000000000..dec45ad6102f45
--- /dev/null
+++ b/src/utils/fast_init.py
@@ -0,0 +1,63 @@
+import contextlib
+from typing import Callable, Dict, Type
+
+import torch
+
+from transformers import Conv1D
+from transformers.modeling_utils import no_init_weights
+
+
+def _conv1d_init(self, nf, nx, device=None):
+    """Set up a Conv1D module, creating its weight and bias directly on `device`."""
+    super(Conv1D, self).__init__()
+    self.nf = nf
+    w = torch.empty(nx, nf, device=device)
+    torch.nn.init.normal_(w, std=0.02)
+    self.weight = torch.nn.Parameter(w)
+    b = torch.empty(nf, device=device)
+    torch.nn.init.zeros_(b)
+    self.bias = torch.nn.Parameter(b)
+
+
+# Constructors wrapped by `fast_init`; each one is called with a `device` keyword argument.
+_ORIGINAL_INITS: Dict[Type[torch.nn.Module], Callable] = {
+    Conv1D: _conv1d_init,
+    torch.nn.Linear: torch.nn.Linear.__init__,
+    torch.nn.Embedding: torch.nn.Embedding.__init__,
+    torch.nn.LayerNorm: torch.nn.LayerNorm.__init__,
+}
+
+
+def _get_fast_init(cls: Type[torch.nn.Module], device: torch.device):
+    """Build a replacement `__init__` for `cls`: construct on "meta", then `to_empty` onto `device`."""
+    assert cls in _ORIGINAL_INITS
+
+    def _fast_init(self, *args, **kwargs):
+        # Same as torch.nn.utils.skip_init, excluding checks
+        _ORIGINAL_INITS[cls](self, *args, **kwargs, device="meta")
+        self.to_empty(device=device)
+
+    return _fast_init
+
+
+@contextlib.contextmanager
+def fast_init(device: torch.device, init_weights: bool = False):
+    """
+    Avoid multiple slow initializations on cpu.
+
+    While the context is active, the constructors of the classes in `_ORIGINAL_INITS` are replaced so that
+    new modules get uninitialized parameters on `device`. Unless `init_weights` is set, the body also runs
+    inside `no_init_weights()`. The constructors that were installed on entry are put back on exit, even
+    if the body raises.
+    """
+    # Snapshot the current constructors so that exit restores exactly what was there before.
+    saved_inits = {cls: cls.__init__ for cls in _ORIGINAL_INITS}
+    for cls in _ORIGINAL_INITS:
+        cls.__init__ = _get_fast_init(cls, device)
+
+    try:
+        with contextlib.nullcontext() if init_weights else no_init_weights():
+            yield
+    finally:
+        for cls, original_init in saved_inits.items():
+            cls.__init__ = original_init
diff --git a/src/utils/dummy.py b/src/utils/input.py
similarity index 100%
rename from src/utils/dummy.py
rename to src/utils/input.py
diff --git a/src/utils/logging.py b/src/utils/logging.py
new file mode 100644
index 00000000000000..4ec8a39f5d5759
--- /dev/null
+++ b/src/utils/logging.py
@@ -0,0 +1,54 @@
+import logging
+import logging.config
+from typing import Callable
+
+from torch import distributed as dist
+
+
+def configure_logging(name=None):
+    """
+    Configure the root logger (and a logger named "default") to write INFO-level records to stdout.
+
+    If `name` is given, it is added in brackets after the timestamp of every record.
+    """
+    # The name ends up inside a %-style format string, so literal percent signs must be escaped.
+    prefix = "" if name is None else f" [{name.replace('%', '%%')}]"
+    logging_config = {
+        "version": 1,
+        "disable_existing_loggers": False,
+        "formatters": {"default": {"format": f"%(asctime)s{prefix}: %(message)s"}},
+        "handlers": {
+            "default": {
+                "level": "INFO",
+                "formatter": "default",
+                "class": "logging.StreamHandler",
+                "stream": "ext://sys.stdout",
+            }
+        },
+        "loggers": {"default": {"level": "DEBUG", "handlers": ["default"]}},
+        "root": {"handlers": ["default"], "level": "INFO"},
+    }
+    logging.config.dictConfig(logging_config)
+
+
+def log_rank_n(msg: str, logger: Callable = logging.info, rank: int = 0):
+    """
+    Log `msg` line by line through `logger`, on process `rank` only.
+
+    Every process logs when `rank` is negative or `torch.distributed` is not initialized.
+    """
+    if rank < 0 or not dist.is_initialized() or dist.get_rank() == rank:
+        # Multi-line logs break formatting
+        for line in msg.splitlines():
+            logger(line)
+
+
+def log_dict(data: dict, logger: Callable = logging.info, rank: int = 0):
+    """Log each entry of `data` as "key: value" using `log_rank_n`."""
+    for key, value in data.items():
+        log_rank_n(f"{key}: {value}", logger, rank)
+
+
+def format_ms(t: float):
+    """Format a duration given in seconds as milliseconds with two decimals."""
+    return f"{1000 * t:.2f} ms"
diff --git a/src/utils/utils.py b/src/utils/utils.py
index 2ae7b7f7dffc28..d678fe33389d5c 100644
--- a/src/utils/utils.py
+++ b/src/utils/utils.py
@@ -2,12 +2,10 @@
 from functools import partial
 from typing import Any, List, Tuple, Union
 
-import torch.distributed as dist
-
 
 def run_and_log_time(execs: Union[List[partial], partial]) -> Tuple[Union[List[Any], Any], float]:
     # runs a function / list of functions and times them
-    start_time = time.time()
+    start_time = time.perf_counter()
 
     if type(execs) == list:
         results = []
@@ -16,30 +14,5 @@ def run_and_log_time(execs: Union[List[partial], partial]) -> Tuple[Union[List[A
     else:
         results = execs()
 
-    time_elapsed = time.time() - start_time
+    time_elapsed = time.perf_counter() - start_time
     return results, time_elapsed
-
-
-def run_rank_n(func: partial, barrier: bool = False, rank: int = 0, other_rank_output: Any = None) -> Any:
-    # runs function on only process with specified rank
-    if dist.is_initialized():
-        if dist.get_rank() == rank:
-            output = func()
-            if barrier:
-                dist.barrier()
-            return output
-        else:
-            if barrier:
-                dist.barrier()
-            return other_rank_output
-    else:
-        return func()
-
-
-def print_rank_n(*values, rank: int = 0) -> None:
-    # print on only process with specified rank
-    if dist.is_initialized():
-        if dist.get_rank() == rank:
-            print(*values)
-    else:
-        print(*values)
diff --git a/transformers b/transformers
index b7e2124cf72623..98319da5defd14 160000
--- a/transformers
+++ b/transformers
@@ -1 +1 @@
-Subproject commit b7e2124cf726235ccaefe17ff960e9117f86949c
+Subproject commit 98319da5defd1462c05f7eba1b9e215c704b274e