Profiling and misc (huggingface#10)
jlamypoirier authored Jan 30, 2023
1 parent 36cacf0 commit 4be387d
Showing 19 changed files with 454 additions and 274 deletions.
4 changes: 3 additions & 1 deletion Dockerfile
@@ -11,6 +11,8 @@ RUN useradd -m -u $USER -s /bin/bash $USERNAME \

 COPY --chown=$USERNAME ./requirements.txt ./
+COPY --chown=$USERNAME transformers/ ./transformers
-RUN pip install -r requirements.txt

+# Stock version of pip doesn't work with editable transformers.
+RUN pip install --upgrade pip --no-cache-dir && pip install -r requirements.txt --no-cache-dir

 COPY --chown=$USERNAME src/ ./src
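
Note: the image now copies the transformers checkout and upgrades pip before installing, since requirements.txt switches to an editable install (-e ./transformers, see below) and, as the commit comment says, the base image's stock pip cannot handle it (presumably lacking PEP 660 support). A minimal sanity check inside the container, not part of the commit, assuming the package imports as transformers:

    # The editable install should resolve to the in-tree checkout,
    # not to site-packages.
    import transformers

    print(transformers.__file__)  # expect a path under ./transformers
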
52 changes: 16 additions & 36 deletions Makefile
@@ -6,69 +6,49 @@ style:

 batch_size := 1

-install-mqa-transformers:
-	git clone https://github.com/bigcode-project/transformers.git; \
-	cd transformers; \
-	git checkout mayank/multi_query; \
-	pip install .; \
-	cd ..; \
-	rm -rf transformers;
+install:
+	git submodule update --init
+	pip install -r requirements.txt

 # BLOOM AliBi
 hf-1b-bloom-fp32:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class BLOOM --dtype float32 --batch_size ${batch_size}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class BLOOM --dtype float32 --batch_size ${batch_size}

 hf-1b-bloom-bf16:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class BLOOM --dtype bfloat16 --batch_size ${batch_size}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class BLOOM --dtype bfloat16 --batch_size ${batch_size}

 hf-1b-bloom-int8:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class BLOOM --dtype int8 --batch_size ${batch_size}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class BLOOM --dtype int8 --batch_size ${batch_size}

 ds-inference-1b-bloom-fp16:
-	deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class BLOOM --batch_size ${batch_size}
+	deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Pipeline --model_class BLOOM --batch_size ${batch_size}

 # GPT2 MHA
 hf-1b-GPT2-mha-fp32:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype float32 --batch_size ${batch_size}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype float32 --batch_size ${batch_size}

 hf-1b-GPT2-mha-bf16:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype bfloat16 --batch_size ${batch_size}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype bfloat16 --batch_size ${batch_size}

 hf-1b-GPT2-mha-int8:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype int8 --batch_size ${batch_size}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype int8 --batch_size ${batch_size}

 ds-inference-1b-GPT2-mha-fp16:
-	deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --batch_size ${batch_size}
-
-# GPT2 MQA
-hf-1b-GPT2-mqa-fp32:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype float32 --batch_size ${batch_size}
-
-hf-1b-GPT2-mqa-bf16:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype bfloat16 --batch_size ${batch_size}

-hf-1b-GPT2-mqa-int8:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype int8 --batch_size ${batch_size}
-
-ds-inference-1b-GPT2-mqa-fp16:
-	deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --batch_size ${batch_size}
+	deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --batch_size ${batch_size}

 # GPT2 MQA1
 hf-1b-GPT2-mqa1-fp32:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --dtype float32 --batch_size ${batch_size}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype float32 --batch_size ${batch_size}

 hf-1b-GPT2-mqa1-bf16:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --dtype bfloat16 --batch_size ${batch_size}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype bfloat16 --batch_size ${batch_size}

 hf-1b-GPT2-mqa1-int8:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --dtype int8 --batch_size ${batch_size}
-
-ds-inference-1b-GPT2-mqa1-fp16:
-	deepspeed --num_gpus 1 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class DS_Inference_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --batch_size ${batch_size}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype int8 --batch_size ${batch_size}

 # Input length experiments
 hf-1b-GPT2-mqa1-int8-input-length:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 3 --dtype int8 --batch_size ${batch_size} --max_input_length ${max_input_length}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 2 --dtype int8 --batch_size ${batch_size} --max_input_length ${max_input_length}

 hf-1b-GPT2-mha-int8-input-length:
-	python src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_GPU_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype int8 --batch_size ${batch_size} --max_input_length ${max_input_length}
+	python3 src/main.py --hidden_size 2048 --n_head 16 --n_layer 24 --pipeline_class HF_Pipeline --model_class GPT2 --n_positions 2048 --attention_type 1 --dtype int8 --batch_size ${batch_size} --max_input_length ${max_input_length}
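
Note: the install target now pulls the bigcode transformers fork as a git submodule instead of a throwaway clone, and the separate --attention_type 2 "MQA" targets are dropped, with the surviving "mqa1" targets renumbered from 3 to 2. The numeric mapping below is inferred purely from the Makefile targets above; the names are illustrative and the authoritative definition lives in the transformers submodule:

    # Illustrative sketch only: values taken from the Makefile targets above.
    from enum import IntEnum

    class AttentionType(IntEnum):
        MULTI_HEAD = 1   # "mha" targets pass --attention_type 1
        MULTI_QUERY = 2  # "mqa1" targets now pass --attention_type 2
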
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,7 +1,7 @@
 accelerate==0.15.0
 bitsandbytes
 deepspeed==0.7.7
-./transformers
+-e ./transformers

 # TODO: Dev only
 isort>=5.5.4
Empty file added src/__init__.py
Empty file.
18 changes: 11 additions & 7 deletions src/main.py
@@ -1,19 +1,23 @@
-import pipelines
-from utils import benchmark_end_to_end, get_arg_parser, get_args, get_dummy_batch
+from typing import List, Optional
+
+from src.pipelines import get_pipeline_class
+from src.utils.arguments import parse_args
+from src.utils.benchmark import benchmark_end_to_end
+from src.utils.input import get_dummy_batch
+from src.utils.logging import configure_logging


-def main() -> None:
-    # deepspeed.init_distributed("nccl")
-
-    args = get_args(get_arg_parser())
+def main(argv: Optional[List[str]] = None) -> None:
+    args = parse_args(argv=argv)

     inputs = get_dummy_batch(args.batch_size, args.max_input_length)

-    generate_kwargs = dict(max_new_tokens=args.max_new_tokens, do_sample=False)
+    generate_kwargs = {"max_new_tokens": args.max_new_tokens, "do_sample": False}

-    pipeline_class = getattr(pipelines, args.pipeline_class)
+    pipeline_class = get_pipeline_class(args.pipeline_class)
     benchmark_end_to_end(args, pipeline_class, inputs, generate_kwargs)


 if __name__ == "__main__":
+    configure_logging()
     main()
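
Note: main() now accepts an optional argv list, so the benchmark can be driven programmatically as well as from the command line. A minimal sketch, with flag values borrowed from the hf-1b-bloom-bf16 Makefile target and assuming defaults exist for the remaining arguments:

    from src.main import main
    from src.utils.logging import configure_logging

    configure_logging()
    main([
        "--hidden_size", "2048",
        "--n_head", "16",
        "--n_layer", "24",
        "--pipeline_class", "HF_Pipeline",
        "--model_class", "BLOOM",
        "--dtype", "bfloat16",
        "--batch_size", "1",
    ])
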
14 changes: 11 additions & 3 deletions src/pipelines/__init__.py
@@ -1,3 +1,11 @@
-from .ds_inference import DS_Inference_Pipeline
-from .hf import HF_CPU_Pipeline, HF_GPU_Pipeline
-from .pipeline import Pipeline
+def get_pipeline_class(name):
+    if name == "HF_Pipeline":
+        from src.pipelines.transformers import HF_Pipeline
+
+        return HF_Pipeline
+    elif name == "DS_Pipeline":
+        from src.pipelines.ds import DS_Pipeline
+
+        return DS_Pipeline
+    else:
+        raise NotImplementedError(f"Unsupported pipeline class: {name}")
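
Note: get_pipeline_class() replaces the eager module-level imports with lazy per-backend imports, so selecting the plain transformers pipeline no longer requires deepspeed to be importable, and vice versa. Usage, following the call in src/main.py:

    from src.pipelines import get_pipeline_class

    # Only the chosen backend's module is imported here.
    pipeline_class = get_pipeline_class("HF_Pipeline")
    pipeline = pipeline_class(args)  # args: the Namespace returned by parse_args()
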
25 changes: 25 additions & 0 deletions src/pipelines/ds.py
@@ -0,0 +1,25 @@
+import os
+from argparse import Namespace
+
+import deepspeed
+import torch
+
+from src.pipelines.pipeline import Pipeline
+from src.utils.arguments import check_unused
+
+
+class DS_Pipeline(Pipeline):
+    def __init__(self, args: Namespace) -> None:
+        check_unused(args, {"device": torch.device("cuda")}, enforce=True)
+        # TODO: Works with other dtypes?
+        check_unused(args, {"dtype": torch.float16})
+        super().__init__(args)
+
+        self.model = deepspeed.init_inference(
+            self.model,
+            mp_size=int(os.getenv("WORLD_SIZE", "1")),
+            # base_dir="./",
+            dtype=args.dtype,
+            replace_with_kernel_inject=args.inject_kernel,
+            enable_cuda_graph=args.cuda_graph,
+        )
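
Note: check_unused() lives in src/utils/arguments.py, which is not shown in this diff. Judging from the two call sites above, it pins backend-specific argument values (device must be cuda for DeepSpeed; dtype defaults to float16, with the TODO questioning other dtypes). A hypothetical reconstruction, not the actual implementation:

    from argparse import Namespace

    # Hypothetical sketch inferred from the call sites above.
    def check_unused(args: Namespace, defaults: dict, enforce: bool = False) -> None:
        for name, value in defaults.items():
            current = getattr(args, name, None)
            if current is None:
                setattr(args, name, value)  # fall back to the backend default
            elif enforce and current != value:
                raise ValueError(f"--{name} must be {value} for this pipeline, got {current}")
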
39 changes: 0 additions & 39 deletions src/pipelines/ds_inference.py

This file was deleted.

32 changes: 0 additions & 32 deletions src/pipelines/hf.py

This file was deleted.
