diff --git a/models/Qwen2.5-7B-Instruct-v8-k65536-65536-woft b/models/Qwen2.5-7B-Instruct-v8-k65536-65536-woft new file mode 160000 index 0000000..ce2453d --- /dev/null +++ b/models/Qwen2.5-7B-Instruct-v8-k65536-65536-woft @@ -0,0 +1 @@ +Subproject commit ce2453de7e14a574ab4129fb57be017d1e0353fb diff --git a/requirements.txt b/requirements.txt index a475439..cbbdb55 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,6 @@ torch transformers>=4.44 safetensors psutil -accelerate \ No newline at end of file +accelerate +gradio +plotly==5.9.0 \ No newline at end of file diff --git a/vptq/app.py b/vptq/app.py index 9eaf859..91a19b0 100644 --- a/vptq/app.py +++ b/vptq/app.py @@ -7,6 +7,7 @@ import gradio as gr +from vptq.app_gpu import update_charts as _update_charts from vptq.app_utils import get_chat_loop_generator chat_completion = get_chat_loop_generator("VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k32768-0-woft") @@ -48,21 +49,30 @@ def respond( """ For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface """ -demo = gr.ChatInterface( - respond, - additional_inputs=[ - gr.Textbox(value="You are a friendly Chatbot.", label="System message"), - gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"), - gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"), - gr.Slider( - minimum=0.1, - maximum=1.0, - value=0.95, - step=0.05, - label="Top-p (nucleus sampling)", - ), - ], -) +with gr.Blocks(fill_height=True) as demo: + with gr.Row(): + + def update_chart(): + return _update_charts(chart_height=200) + + gpu_chart = gr.Plot(update_chart, every=0.01) # update every 0.01 seconds + + with gr.Column(): + chat_interface = gr.ChatInterface( + respond, + additional_inputs=[ + gr.Textbox(value="You are a friendly Chatbot.", label="System message"), + gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"), + gr.Slider(minimum=0.1, 
import re
import subprocess
from collections import deque
from typing import TYPE_CHECKING

if TYPE_CHECKING:  # only needed for the return annotation of update_charts
    import plotly.graph_objs as go

# Rolling windows with the last 20 GPU-utilization and memory-usage samples
# (both in percent) that update_charts() plots.
gpu_util_history = deque(maxlen=20)
mem_usage_history = deque(maxlen=20)

# Patterns applied to the text printed by ``nvidia-smi``; compiled once because
# the chart refresh calls the parser repeatedly.
_UTIL_RE = re.compile(r'(\d+)%\s+Default')
_MEM_RE = re.compile(r'(\d+)MiB / (\d+)MiB')
# The look-arounds keep hex PCI bus IDs such as "00000000:1C:00.0" from being
# read as a temperature of "1C".
_TEMP_RE = re.compile(r'(?<![\w:.])(\d+)C(?![\w:.])')
# Accepts both "70W / 400W" (unit before the slash) and "70 / 400 W".
_POWER_RE = re.compile(r'(\d+)\s*W?\s*/\s*(\d+)\s*W')
# NOTE(review): this "<n>MHz MEM <n>MHz" layout is kept from the original code;
# which nvidia-smi output format produces it is not visible here.
_CLOCK_RE = re.compile(r'(\d+)MHz\s+MEM\s+(\d+)MHz')


def get_nvidia_smi_info() -> str:
    """Run ``nvidia-smi`` and return what it printed on standard output.

    Returns:
        str: The captured stdout text, or an empty string when the executable
        is missing or cannot be launched. Callers read every field with a
        default, so an empty string makes them fall back to those defaults
        instead of raising on every refresh.
    """
    try:
        result = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE, text=True)
    except OSError:
        return ''
    return result.stdout


def parse_nvidia_smi_output(output: str) -> dict:
    """Extract GPU statistics from ``nvidia-smi`` text output.

    Only the first match of each pattern is used, so on a multi-GPU machine
    the values describe the first GPU listed.

    Args:
        output (str): Text printed by ``nvidia-smi``.

    Returns:
        dict: Contains only the fields that were found. Possible keys are
        ``gpu_util``, ``mem_used``, ``mem_total``, ``mem_percent``, ``temp``,
        ``power_used``, ``power_max``, ``gpu_clock`` and ``mem_clock``.
        ``mem_percent`` is a float; every other value is an int.
    """
    gpu_info = {}

    utilization = _UTIL_RE.search(output)
    if utilization:
        gpu_info['gpu_util'] = int(utilization.group(1))

    memory = _MEM_RE.search(output)
    if memory:
        used, total = int(memory.group(1)), int(memory.group(2))
        gpu_info['mem_used'] = used
        gpu_info['mem_total'] = total
        # A reported total of 0 MiB must not crash the parser.
        gpu_info['mem_percent'] = used / total * 100 if total else 0.0

    temp = _TEMP_RE.search(output)
    if temp:
        gpu_info['temp'] = int(temp.group(1))

    power = _POWER_RE.search(output)
    if power:
        gpu_info['power_used'] = int(power.group(1))
        gpu_info['power_max'] = int(power.group(2))

    gpu_clock = _CLOCK_RE.search(output)
    if gpu_clock:
        gpu_info['gpu_clock'] = int(gpu_clock.group(1))
        gpu_info['mem_clock'] = int(gpu_clock.group(2))

    return gpu_info


def update_charts(chart_height: int = 200) -> "go.Figure":
    """
    Update the GPU utilization and memory usage charts.

    Each call samples ``nvidia-smi`` once, appends the new utilization and
    memory percentages to the module-level history deques, and rebuilds the
    figure from the whole history.

    Args:
        chart_height (int, optional): used to set the height of the chart. Defaults to 200.

    Returns:
        plotly.graph_objs.Figure: The updated figure containing the GPU and memory usage charts.
    """
    # Imported here so the nvidia-smi helpers above can be imported without
    # the plotting stack; repeated imports are a cheap sys.modules lookup.
    import plotly.graph_objs as go

    # obtain GPU information
    gpu_info = parse_nvidia_smi_output(get_nvidia_smi_info())

    # record the latest values; missing fields are plotted as 0
    gpu_util_history.append(round(gpu_info.get('gpu_util', 0), 1))
    mem_usage_history.append(round(gpu_info.get('mem_percent', 0), 1))

    # create GPU utilization line chart
    gpu_trace = go.Scatter(y=list(gpu_util_history),
                           mode='lines+markers+text',
                           name='GPU Utilization (%)',
                           text=list(gpu_util_history),
                           textposition='top center')

    # create memory usage line chart
    mem_trace = go.Scatter(y=list(mem_usage_history),
                           mode='lines+markers+text',
                           name='Memory Usage (%)',
                           text=list(mem_usage_history),
                           textposition='top center')

    # set the layout of the chart
    layout = go.Layout(
        xaxis=dict(title=None, showticklabels=False, ticks=''),
        yaxis=dict(
            title='Percentage (%)',
            range=[-5, 110],  # head-room so the text labels above 100% stay visible
        ),
        height=chart_height,
        margin=dict(l=10, r=10, t=0, b=0),
    )

    return go.Figure(data=[gpu_trace, mem_trace], layout=layout)


def mem_bar(used: float, total: float) -> str:
    """
    Generates a memory usage bar.

    The bar is always 50 characters wide: ``|`` for the used share and spaces
    for the rest. A non-positive ``total`` yields an empty bar instead of a
    division error, and the filled part is clamped to the bar width.

    Args:
        used (float): The amount of memory used in GiB.
        total (float): The total amount of memory available in GiB.
    Returns:
        str: A string of the form ``MEM[<bar><used>Gi/<total>Gi]``.
    """
    bar_length = 50
    if total > 0:
        used_bars = int(bar_length * used / total)
        # keep the bar at a fixed width even for used > total or used < 0
        used_bars = max(0, min(bar_length, used_bars))
    else:
        used_bars = 0
    bar = '|' * used_bars + ' ' * (bar_length - used_bars)
    return f"MEM[{bar}{used:.3f}Gi/{total:.3f}Gi]"
+ + Returns: + str: An HTML formatted string containing the GPU information, including + GPU clock speed, memory clock speed, temperature, power usage, + GPU utilization, and memory usage. + """ + + output = get_nvidia_smi_info() + gpu_info = parse_nvidia_smi_output(output) + + gpu_clock = gpu_info.get('gpu_clock', 'N/A') + mem_clock = gpu_info.get('mem_clock', 'N/A') + temp = gpu_info.get('temp', 'N/A') + power_used = gpu_info.get('power_used', 'N/A') + power_max = gpu_info.get('power_max', 'N/A') + gpu_util = gpu_info.get('gpu_util', 0) + mem_used = gpu_info.get('mem_used', 0) / 1024 # MiB to GiB + mem_total = gpu_info.get('mem_total', 0) / 1024 # MiB to GiB + + gpu_info_display = (f"