Merging main #4

Merged 22 commits on Aug 21, 2024
Changes from all commits
Commits (22)
3e480e9
Fixed single GPU issue without setting up mp. Added toggles for serve…
gshtras Aug 2, 2024
42b1b9a
Add distributed executor backend to benchmark scripts (#118)
mawong-amd Aug 2, 2024
5fac73f
Add weight padding for moe (#119)
charlifu Aug 2, 2024
c034d5d
[BugFix] Fix navi build after many custom for MI kernels added (#116)
maleksan85 Aug 6, 2024
98f31cd
add emtpy_cache() after each padding (#120)
charlifu Aug 6, 2024
30f12f0
[FIX] Gradlib OOM on Navi and sometimes on MI (#124)
maleksan85 Aug 8, 2024
8608888
save shape when fp8 solution not found (#123)
charlifu Aug 8, 2024
f49dff3
Fix unit test for moe by adding padding (#128)
charlifu Aug 12, 2024
dd1a208
Llama3.1 (#129)
gshtras Aug 12, 2024
674da1d
chat/completions endpoint (#121)
gshtras Aug 13, 2024
636ff01
Optimize custom all reduce (#130)
iotamudelta Aug 14, 2024
d5bf9bc
Add BF16 support to custom PA (#133)
sanyalington Aug 14, 2024
4132cbe
Making check for output match in original types. It saves some memory…
maleksan85 Aug 14, 2024
4d2dda6
Make CAR ROCm 6.1 compatible. (#137)
iotamudelta Aug 14, 2024
e7c3a5c
Car revert (#140)
gshtras Aug 15, 2024
5945822
Using the correct datatypes for streaming non-chat completions (#134)
gshtras Aug 15, 2024
6a8793d
Adding UNREACHABLE_CODE macro for non MI300 and MI250 cards (#138)
maleksan85 Aug 15, 2024
7382dd5
gfx90a typo fix (#142)
maleksan85 Aug 16, 2024
cfab178
wvsplitk templatized and better tuned for MI300 (#132)
amd-hhashemi Aug 16, 2024
c1860d6
[Bugfix] Dockerfile.rocm (#141)
zstreet87 Aug 16, 2024
7c5fd50
Update test-template.j2 (#145)
okakarpa Aug 19, 2024
aa36718
Adding Triton implementations awq_dequantize and awq_gemm to ROCm (#136)
rasmith Aug 20, 2024
4 changes: 2 additions & 2 deletions .buildkite/test-template.j2
@@ -11,7 +11,7 @@ steps:
- "docker push {{ docker_image_amd }}"
plugins:
- docker-login#v3.0.0:
-username: rocmshared
+username: rocm
key: "amd-build"
env:
DOCKER_BUILDKIT: "1"
@@ -38,4 +38,4 @@ steps:
priority: 100
soft_fail: true
{% endif %}
-{% endfor %}
+{% endfor %}
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -19,7 +19,7 @@ set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11")
set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")

# Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101")

#
# Supported/expected torch versions for CUDA/ROCm.
4 changes: 2 additions & 2 deletions Dockerfile.rocm
@@ -22,8 +22,8 @@ USER root
ARG BASE_IMAGE
ARG COMMON_WORKDIR
# Used as ARCHes for all components
-ARG PYTORCH_ROCM_ARCH="gfx90a;gfx942"
-ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
+ARG ARG_PYTORCH_ROCM_ARCH="gfx90a;gfx942"
+ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH}

# Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -
53 changes: 32 additions & 21 deletions benchmarks/benchmark_latency.py
@@ -19,27 +19,30 @@ def main(args: argparse.Namespace):

# NOTE(woosuk): If the request cannot be processed in a single batch,
# the engine will automatically process the request in multiple batches.
-llm = LLM(model=args.model,
-speculative_model=args.speculative_model,
-num_speculative_tokens=args.num_speculative_tokens,
-tokenizer=args.tokenizer,
-quantization=args.quantization,
-quantized_weights_path=args.quantized_weights_path,
-tensor_parallel_size=args.tensor_parallel_size,
-trust_remote_code=args.trust_remote_code,
-dtype=args.dtype,
-enforce_eager=args.enforce_eager,
-kv_cache_dtype=args.kv_cache_dtype,
-quantization_param_path=args.quantization_param_path,
-device=args.device,
-ray_workers_use_nsight=args.ray_workers_use_nsight,
-worker_use_ray=args.worker_use_ray,
-use_v2_block_manager=args.use_v2_block_manager,
-enable_chunked_prefill=args.enable_chunked_prefill,
-download_dir=args.download_dir,
-block_size=args.block_size,
-disable_custom_all_reduce=args.disable_custom_all_reduce,
-gpu_memory_utilization=args.gpu_memory_utilization)
+llm = LLM(
+model=args.model,
+speculative_model=args.speculative_model,
+num_speculative_tokens=args.num_speculative_tokens,
+tokenizer=args.tokenizer,
+quantization=args.quantization,
+quantized_weights_path=args.quantized_weights_path,
+tensor_parallel_size=args.tensor_parallel_size,
+trust_remote_code=args.trust_remote_code,
+dtype=args.dtype,
+enforce_eager=args.enforce_eager,
+kv_cache_dtype=args.kv_cache_dtype,
+quantization_param_path=args.quantization_param_path,
+device=args.device,
+ray_workers_use_nsight=args.ray_workers_use_nsight,
+worker_use_ray=args.worker_use_ray,
+use_v2_block_manager=args.use_v2_block_manager,
+enable_chunked_prefill=args.enable_chunked_prefill,
+download_dir=args.download_dir,
+block_size=args.block_size,
+disable_custom_all_reduce=args.disable_custom_all_reduce,
+gpu_memory_utilization=args.gpu_memory_utilization,
+distributed_executor_backend=args.distributed_executor_backend,
+)

sampling_params = SamplingParams(
n=args.n,
@@ -237,5 +240,13 @@ def run_to_completion(profile_dir: Optional[str] = None):
help='the fraction of GPU memory to be used for '
'the model executor, which can range from 0 to 1.'
'If unspecified, will use the default value of 0.9.')
+parser.add_argument(
+'--distributed-executor-backend',
+choices=['ray', 'mp', 'torchrun'],
+default=None,
+help='Backend to use for distributed serving. When more than 1 GPU '
+'is used, on CUDA this will be automatically set to "ray" if '
+'installed or "mp" (multiprocessing) otherwise. On ROCm, this is '
+'instead set to torchrun by default.')
args = parser.parse_args()
main(args)
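For context on the new flag: the help text above describes platform-dependent defaults (Ray if installed, otherwise multiprocessing on CUDA; torchrun on ROCm). The snippet below is a minimal, illustrative sketch of that selection rule, not vLLM's actual implementation; the function name and the is_rocm flag are assumptions made for the example.

from typing import Optional


def pick_executor_backend(requested: Optional[str],
                          tensor_parallel_size: int,
                          is_rocm: bool) -> Optional[str]:
    # Sketch of the default-selection behaviour described in the
    # --distributed-executor-backend help string; names here are hypothetical.
    if requested is not None:
        return requested          # an explicit CLI choice always wins
    if tensor_parallel_size <= 1:
        return None               # single GPU: no distributed executor needed
    if is_rocm:
        return "torchrun"         # ROCm default per the help text
    try:
        # CUDA path: prefer Ray when it is installed.
        import ray  # noqa: F401
        return "ray"
    except ImportError:
        # Fall back to Python multiprocessing otherwise.
        return "mp"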
15 changes: 13 additions & 2 deletions benchmarks/benchmark_throughput.py
@@ -79,6 +79,7 @@ def run_vllm(
enable_prefix_caching: bool,
enable_chunked_prefill: bool,
max_num_batched_tokens: int,
+distributed_executor_backend: Optional[str],
gpu_memory_utilization: float = 0.9,
worker_use_ray: bool = False,
download_dir: Optional[str] = None,
@@ -104,6 +105,7 @@
download_dir=download_dir,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
+distributed_executor_backend=distributed_executor_backend,
)

# Add the requests to the engine.
@@ -229,8 +231,9 @@ def main(args: argparse.Namespace):
args.max_model_len, args.enforce_eager, args.kv_cache_dtype,
args.quantization_param_path, args.device,
args.enable_prefix_caching, args.enable_chunked_prefill,
-args.max_num_batched_tokens, args.gpu_memory_utilization,
-args.worker_use_ray, args.download_dir)
+args.max_num_batched_tokens, args.distributed_executor_backend,
+args.gpu_memory_utilization, args.worker_use_ray,
+args.download_dir)
elif args.backend == "hf":
assert args.tensor_parallel_size == 1
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -384,6 +387,14 @@ def main(args: argparse.Namespace):
type=str,
default=None,
help='Path to save the throughput results in JSON format.')
+parser.add_argument(
+'--distributed-executor-backend',
+choices=['ray', 'mp', 'torchrun'],
+default=None,
+help='Backend to use for distributed serving. When more than 1 GPU '
+'is used, on CUDA this will be automatically set to "ray" if '
+'installed or "mp" (multiprocessing) otherwise. On ROCm, this is '
+'instead set to torchrun by default.')
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model
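The throughput benchmark threads the same option through run_vllm positionally, just before gpu_memory_utilization; a keyword-style call makes a long argument list like that harder to misorder. The snippet below is a self-contained usage sketch with a stand-in engine class rather than vLLM's LLM, and the model name is only a placeholder.

import argparse
from typing import Optional


class EngineStub:
    # Stand-in for the engine constructor; only the fields relevant to the
    # new option are modelled here.
    def __init__(self, model: str, tensor_parallel_size: int = 1,
                 distributed_executor_backend: Optional[str] = None):
        self.model = model
        self.tensor_parallel_size = tensor_parallel_size
        self.distributed_executor_backend = distributed_executor_backend


parser = argparse.ArgumentParser()
parser.add_argument('--model', default='facebook/opt-125m')
parser.add_argument('--tensor-parallel-size', type=int, default=1)
parser.add_argument('--distributed-executor-backend',
                    choices=['ray', 'mp', 'torchrun'], default=None)
args = parser.parse_args([])  # defaults only; pass sys.argv[1:] in a real script

engine = EngineStub(model=args.model,
                    tensor_parallel_size=args.tensor_parallel_size,
                    distributed_executor_backend=args.distributed_executor_backend)
print(engine.distributed_executor_backend)  # None unless set on the command line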
4 changes: 2 additions & 2 deletions benchmarks/kernels/benchmark_paged_attention.py
@@ -9,7 +9,7 @@
from vllm._custom_C import paged_attention_custom
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random

-NUM_BLOCKS = 1024
+NUM_BLOCKS = 1024 * 1024
PARTITION_SIZE = 256


@@ -176,7 +176,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
if do_profile:
latency = run_benchmark(num_iters=1, profile=True)
else:
-latency = run_benchmark(num_iters=100, profile=False)
+latency = run_benchmark(num_iters=1000, profile=False)
print(f"Kernel running time: {latency * 1000000:.3f} us")


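A note on the NUM_BLOCKS bump from 1024 to 1024 * 1024: the randomly initialized KV cache in this benchmark scales linearly with the block count, so the allocation becomes substantially larger, while the num_iters increase from 100 to 1000 simply averages the timing over more runs. The estimate below is a rough, illustrative calculation; the block size, head count, head size, and dtype are assumptions for the example, not values taken from the script.

# Back-of-the-envelope KV-cache size for the enlarged benchmark allocation.
# All shape parameters below are assumed for illustration only.
num_blocks = 1024 * 1024
block_size = 16        # tokens per block (assumed)
num_kv_heads = 8       # assumed
head_size = 128        # assumed
dtype_bytes = 2        # fp16 / bf16

# Each block stores both a K and a V tile of shape (block_size, num_kv_heads, head_size).
bytes_per_block = 2 * block_size * num_kv_heads * head_size * dtype_bytes
total_gib = num_blocks * bytes_per_block / 2**30
print(f"approx. {total_gib:.0f} GiB of KV cache")  # ~64 GiB under these assumptions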