Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

replace Nvidia-smi with devicesmi #761

Merged
merged 5 commits into from
Dec 5, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 4 additions & 40 deletions gptqmodel/nn_modules/qlinear/bitblas_target_detector.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,7 @@
# TODO replace with device-smi checks

# Copyright (c) Microsoft Corporation.
# License: GPTQModel/licenses/LICENSE.mit
import os
import subprocess
# License: GPTQModel/licenses/LICENSE.apache
from typing import List

from device_smi import Device
from thefuzz import process
from tvm.target import Target
from tvm.target.tag import list_tags
Expand All @@ -19,38 +15,6 @@
"where <target> is one of the available targets can be found in the output of `tools/get_available_targets.py`."
)


def get_gpu_model_from_nvidia_smi(gpu_id: int = 0):
    """
    Executes the 'nvidia-smi' command to fetch the name of the NVIDIA GPU at index `gpu_id`.

    Args:
        gpu_id (int): Zero-based index into the list of GPU names printed by 'nvidia-smi'.

    Returns:
        str: The name of the selected GPU, or None if 'nvidia-smi' is missing, fails,
        or reports no GPUs.

    Raises:
        EnvironmentError: If several GPUs are listed and `CUDA_DEVICE_ORDER` is not `PCI_BUS_ID`.
        ValueError: If `gpu_id` is negative or not smaller than the number of listed GPUs.
    """
    try:
        # Execute nvidia-smi command to get the GPU names: one name per line, no CSV header
        output = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=gpu_name", "--format=csv,noheader"],
            encoding="utf-8",
        )
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing or non-executable nvidia-smi binary, which check_output
        # raises instead of CalledProcessError.
        logger.info("nvidia-smi failed with error: %s", e)
        return None

    # splitlines() copes with "\r\n" line endings; blank lines are dropped so that empty
    # output is not mistaken for a GPU with an empty name.
    gpus = [name.strip() for name in output.splitlines() if name.strip()]
    if not gpus:
        logger.info("nvidia-smi reported no GPUs")
        return None

    # for multiple gpus, CUDA_DEVICE_ORDER=PCI_BUS_ID must be set to match nvidia-smi or else gpu_id is
    # most likely incorrect and the wrong gpu
    if len(gpus) > 1 and os.environ.get("CUDA_DEVICE_ORDER") != "PCI_BUS_ID":
        raise EnvironmentError("Multi-gpu environment must set `CUDA_DEVICE_ORDER=PCI_BUS_ID`.")

    if gpu_id >= len(gpus) or gpu_id < 0:
        raise ValueError(f"Passed gpu_id:{gpu_id} but there are {len(gpus)} detected Nvidia gpus.")

    return gpus[gpu_id]


def find_best_match(tags, query):
"""
Finds the best match for a query within a list of tags using fuzzy string matching.
Expand Down Expand Up @@ -96,12 +60,12 @@ def patched_auto_detect_nvidia_target(gpu_id: int = 0) -> str:
nvidia_tags = [tag for tag in all_tags if "nvidia" in tag]

# Get the current GPU model and find the best matching target
gpu_model = get_gpu_model_from_nvidia_smi(gpu_id=gpu_id)
gpu_model = Device(f"cuda:{gpu_id}").model
# print(f"gpu_model: {gpu_model}")

# compat: Nvidia makes several OEM (non-public) versions of the A100, and perhaps of other models, that
# do not have a clearly defined matching TVM target, so we need to manually map them to the correct one.
if gpu_model in ["NVIDIA PG506-230", "NVIDIA PG506-232"]:
if gpu_model in ["pg506-230", "pg506-232"]:
gpu_model = "NVIDIA A100"

CSY-ModelCloud marked this conversation as resolved.
Show resolved Hide resolved
# print("GPU_model",gpu_model)
Expand Down
Loading