[ghstack] Add support for more shapes #2021

Draft · wants to merge 3 commits into base: gh/jainapurva/28/head
17 changes: 17 additions & 0 deletions benchmarks/microbenchmarks/README.md
@@ -46,6 +46,16 @@ model_params:
          [2048, 4096, 1024],
          [4096, 4096, 1024]
        ]
      - name: "llama"
      - name: "pow2"
        min_power: 10 # Optional, default is 10 (1024)
        max_power: 14 # Optional, default is 14 (16,384)
      - name: "pow2_extended"
        min_power: 10 # Optional, default is 10 (1024)
        max_power: 14 # Optional, default is 14 (16,384)
      - name: "sweep"
        min_power: 8 # Optional, default is 8 (256)
        max_power: 15 # Optional, default is 15 (32,768)
    high_precision_dtype: "torch.bfloat16"
    compile: "max-autotune" # Options: "default", "max-autotune", "false"
    device: "cuda" # Options: "cuda", "mps", "xpu", "cpu"
@@ -54,6 +64,13 @@ model_params:
## Configuration Options
### Shape Generation Options
- `custom`: Manually specify shapes as a list of [m, k, n] dimensions
- `llama`: Use LLaMa 2 70B single-node weight shapes (assumes fused attn.wqkv and ffn.w13)
- `pow2`: Generate shapes with dimensions that are powers of 2 (e.g., 1024, 2048, 4096, etc.)
- `pow2_extended`: Generate shapes whose dimensions are powers of 2 plus the midpoints between consecutive powers of 2 (e.g., 1024, 1536, 2048, 3072)
- `sweep`: Generate the full cross-product of power-of-2 sizes for the M, K, and N dimensions (see the sketch below for how each option expands)
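
For a concrete picture of what each option expands to, the sketch below calls `get_shapes_for_config` (defined in `benchmark_runner.py`) and prints the resulting `(name, [m, k, n])` entries. The import path is an assumption based on the directory layout and may differ depending on how the package is set up.

```python
# Illustrative only: expand a few shape configs into concrete (name, [m, k, n]) entries.
# The import path is assumed from the file layout; adjust it to your environment.
from benchmarks.microbenchmarks.benchmark_runner import get_shapes_for_config

shape_configs = [
    {"name": "custom", "shapes": [[1024, 4096, 1024]]},
    {"name": "pow2", "min_power": 10, "max_power": 12},           # 1024, 2048, 4096
    {"name": "pow2_extended", "min_power": 10, "max_power": 11},  # 1024, 1536, 2048, 3072
]

for name, (m, k, n) in get_shapes_for_config(shape_configs):
    print(f"{name}: M={m}, K={k}, N={n}")
```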

### Quantization Methods
Currently, the quantization string uses the same format as the one passed to llama/generate.py.
- `baseline`: No quantization
43 changes: 42 additions & 1 deletion benchmarks/microbenchmarks/benchmark_runner.py
@@ -48,9 +48,50 @@ def get_shapes_for_config(
        name = shape_config["name"]
        if name == "custom":
            shapes.extend([(name, shape) for shape in shape_config["shapes"]])
        elif name == "llama":
            # LLaMa 2 70B single-node weight shapes
            # assumes fused attn.wqkv and ffn.w13
            bsz, seq_len = 4, 4096
            M = bsz * seq_len
            llama_shapes = {
                "attn.wqkv": (M, 8192, 1280),
                "attn.w0": (M, 1024, 8192),
                "ffn.w13": (M, 8192, 7168),
                "ffn.w2": (M, 3584, 8192),
            }
            shapes.extend([(f"{name}_{k}", v) for k, v in llama_shapes.items()])
        elif name == "pow2":
            # Generate shapes with dimensions that are powers of 2
            min_power_of_2 = shape_config.get("min_power", 10)  # 1024
            max_power_of_2 = shape_config.get("max_power", 14)  # 16,384
            for idx, power_of_2 in enumerate(range(min_power_of_2, max_power_of_2 + 1)):
                val = 2**power_of_2
                shapes.append((f"{name}_{idx}", [val, val, val]))
        elif name == "pow2_extended":
            # Generate shapes with dimensions that are powers of 2 and powers of 2 + half
            min_power_of_2 = shape_config.get("min_power", 10)  # 1024
            max_power_of_2 = shape_config.get("max_power", 14)  # 16,384
            for idx, power_of_2 in enumerate(range(min_power_of_2, max_power_of_2 + 1)):
                val1 = 2**power_of_2
                val2 = 2**power_of_2 + 2 ** (power_of_2 - 1)
                shapes.append((f"{name}_{idx*2}", [val1, val1, val1]))
                shapes.append((f"{name}_{idx*2+1}", [val2, val2, val2]))
        elif name == "sweep":
            # Generate a sweep of shapes with different powers of 2 for M, K, N
            min_p2 = shape_config.get("min_power", 8)  # 256
            max_p2 = shape_config.get("max_power", 15)  # 32,768
            counter = 0
            for M_p2 in range(min_p2, max_p2 + 1):
                M = 2**M_p2
                for K_p2 in range(min_p2, max_p2 + 1):
                    K = 2**K_p2
                    for N_p2 in range(min_p2, max_p2 + 1):
                        N = 2**N_p2
                        shapes.append((f"{name}_{counter}", [M, K, N]))
                        counter += 1
        else:
            raise NotImplementedError(
                f"Shape config {name} not supported. Supported options: custom, llama, pow2, pow2_extended, sweep."
            )
    return shapes
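
# Illustrative sketch (not part of this patch): the "sweep" branch above
# enumerates the full cross-product of power-of-2 sizes for M, K, and N, so it
# produces (max_power - min_power + 1) ** 3 shapes. The standalone helper below
# shows an equivalent formulation using only the standard library;
# `sweep_shapes` is a hypothetical name used here for illustration.
import itertools


def sweep_shapes(min_power: int = 8, max_power: int = 15):
    powers = [2**p for p in range(min_power, max_power + 1)]
    # itertools.product iterates with the last dimension varying fastest,
    # matching the nested M/K/N loops in get_shapes_for_config.
    return [
        (f"sweep_{i}", [m, k, n])
        for i, (m, k, n) in enumerate(itertools.product(powers, repeat=3))
    ]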

94 changes: 55 additions & 39 deletions benchmarks/microbenchmarks/test/benchmark_config.yml
@@ -2,27 +2,29 @@
benchmark_mode: "inference"
quantization_config_recipe_names:
  # Will run a baseline inference for model by default, without quantization for comparison
  - "int4wo-32"
  # - "marlin"
  - "int8wo"
  - "int8dq"
  - "float8dq"
# sparsity_config_recipe_names:
  # Will run a baseline inference for model by default, without sparsity for comparison
  # - "semi-sparse"
  # - "block"
output_dir: "benchmarks/microbenchmarks/results"
model_params:
# - name: "small_bf16_linear"
# matrix_shapes:
# - name: "custom"
# shapes: [
# [1024, 1024, 1024], # [m, k, n]
# ]
# high_precision_dtype: "torch.bfloat16"
# use_torch_compile: true
# torch_compile_mode: "max-autotune"
# device: "cuda"
# model_type: "linear"
# enable_profiler: true # Enable profiling for this model
- name: "small_bf16_linear"
matrix_shapes:
- name: "custom"
shapes: [
[1024, 1024, 1024], # [m, k, n]
]
high_precision_dtype: "torch.bfloat16"
use_torch_compile: true
torch_compile_mode: "max-autotune"
device: "cuda"
model_type: "linear"
enable_profiler: true # Enable profiling for this model

- name: "large_bf16_ln_linear"
matrix_shapes:
@@ -31,6 +33,20 @@ model_params:
          [2048, 4096, 1024],
          # [4096, 4096, 1024]
        ]
      # Example of using LLaMa shapes
      - name: "llama"
      # Example of using power of 2 shapes
      - name: "pow2"
        min_power: 10 # 1024
        max_power: 12 # 4096
      # Example of using extended power of 2 shapes
      - name: "pow2_extended"
        min_power: 10 # 1024
        max_power: 11 # 2048
      # Example of using sweep shapes (commented out as it generates many shapes)
      # - name: "sweep"
      #   min_power: 8 # 256
      #   max_power: 9 # 512
    high_precision_dtype: "torch.bfloat16"
    use_torch_compile: true
    torch_compile_mode: "max-autotune"
@@ -51,30 +67,30 @@ model_params:
# model_type: "linear"
# enable_profiler: true # Enable profiling for this model

- name: "bf16_rms_norm_linear_activation"
matrix_shapes:
- name: "custom"
shapes: [
[2048, 4096, 1024],
]
high_precision_dtype: "torch.bfloat16"
use_torch_compile: true
torch_compile_mode: "max-autotune"
device: "cuda"
model_type: "rms_norm_linear_activation"
enable_profiler: true
enable_memory_profile: true
# - name: "bf16_rms_norm_linear_activation"
# matrix_shapes:
# - name: "custom"
# shapes: [
# [2048, 4096, 1024],
# ]
# high_precision_dtype: "torch.bfloat16"
# use_torch_compile: true
# torch_compile_mode: "max-autotune"
# device: "cuda"
# model_type: "rms_norm_linear_activation"
# enable_profiler: true
# enable_memory_profile: true

- name: "bf16_transformer_block"
matrix_shapes:
- name: "custom"
shapes: [
[2048, 4096, 1024], # For transformer_block, k is the hidden dimension
]
high_precision_dtype: "torch.bfloat16"
use_torch_compile: true
torch_compile_mode: "max-autotune"
device: "cuda"
model_type: "transformer_block"
enable_profiler: true
enable_memory_profile: true
# - name: "bf16_transformer_block"
# matrix_shapes:
# - name: "custom"
# shapes: [
# [2048, 4096, 1024], # For transformer_block, k is the hidden dimension
# ]
# high_precision_dtype: "torch.bfloat16"
# use_torch_compile: true
# torch_compile_mode: "max-autotune"
# device: "cuda"
# model_type: "transformer_block" # TODO: Add a custom model (Figure out how to do this, maybe pass a .py file with model definition)
# enable_profiler: true
# enable_memory_profile: true
60 changes: 60 additions & 0 deletions benchmarks/microbenchmarks/test/test_benchmark_runner.py
@@ -57,12 +57,72 @@ def tearDown(self):
        shutil.rmtree(self.temp_dir)

    def test_get_shapes_for_config(self):
        # Test custom shapes
        shapes = get_shapes_for_config(
            self.test_config["model_params"][0]["matrix_shapes"]
        )
        self.assertEqual(len(shapes), 1)
        self.assertEqual(shapes[0], ("custom", [1024, 1024, 1024]))

        # Test llama shapes
        llama_shapes = get_shapes_for_config([{"name": "llama"}])
        self.assertEqual(len(llama_shapes), 4)  # 4 LLaMa shapes
        self.assertTrue(
            any(name.startswith("llama_attn.wqkv") for name, _ in llama_shapes)
        )
        self.assertTrue(
            any(name.startswith("llama_attn.w0") for name, _ in llama_shapes)
        )
        self.assertTrue(
            any(name.startswith("llama_ffn.w13") for name, _ in llama_shapes)
        )
        self.assertTrue(
            any(name.startswith("llama_ffn.w2") for name, _ in llama_shapes)
        )

        # Test pow2 shapes
        pow2_shapes = get_shapes_for_config(
            [{"name": "pow2", "min_power": 10, "max_power": 12}]
        )
        self.assertEqual(len(pow2_shapes), 3)  # 3 powers of 2 (10, 11, 12)
        self.assertEqual(pow2_shapes[0], ("pow2_0", [1024, 1024, 1024]))  # 2^10
        self.assertEqual(pow2_shapes[1], ("pow2_1", [2048, 2048, 2048]))  # 2^11
        self.assertEqual(pow2_shapes[2], ("pow2_2", [4096, 4096, 4096]))  # 2^12

        # Test pow2_extended shapes
        pow2_extended_shapes = get_shapes_for_config(
            [{"name": "pow2_extended", "min_power": 10, "max_power": 11}]
        )
        self.assertEqual(
            len(pow2_extended_shapes), 4
        )  # 2 powers of 2, each with 2 variants
        self.assertEqual(
            pow2_extended_shapes[0], ("pow2_extended_0", [1024, 1024, 1024])
        )  # 2^10
        self.assertEqual(
            pow2_extended_shapes[1], ("pow2_extended_1", [1536, 1536, 1536])
        )  # 2^10 + 2^9
        self.assertEqual(
            pow2_extended_shapes[2], ("pow2_extended_2", [2048, 2048, 2048])
        )  # 2^11
        self.assertEqual(
            pow2_extended_shapes[3], ("pow2_extended_3", [3072, 3072, 3072])
        )  # 2^11 + 2^10

        # Test sweep shapes (limited to a small range for testing)
        sweep_shapes = get_shapes_for_config(
            [{"name": "sweep", "min_power": 8, "max_power": 9}]
        )
        # For min_power=8, max_power=9, we should have 8 shapes (2^3 = 8 combinations)
        self.assertEqual(len(sweep_shapes), 8)
        # Check that all shapes have the expected format
        for name, shape in sweep_shapes:
            self.assertTrue(name.startswith("sweep_"))
            self.assertEqual(len(shape), 3)  # [M, K, N]
            # Check that all dimensions are powers of 2 between 2^8 and 2^9
            for dim in shape:
                self.assertTrue(dim in [256, 512])  # 2^8, 2^9

    def test_get_param_combinations(self):
        model_param = self.test_config["model_params"][0]
        shapes, params = get_param_combinations(model_param)