Skip to content

Commit bea92ea

Browse files
authored
feat: profiling PVC updates for better UX (#2402)
1 parent c91e2e4 commit bea92ea

File tree

5 files changed

+780
-26
lines changed

5 files changed

+780
-26
lines changed
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
apiVersion: v1
5+
kind: Pod
6+
metadata:
7+
name: pvc-access-pod
8+
labels:
9+
app: pvc-access
10+
spec:
11+
activeDeadlineSeconds: 300 # Pod is terminated and marked Failed after 5 minutes; the Pod object remains until it is deleted
12+
securityContext:
13+
runAsNonRoot: true
14+
runAsUser: 1000
15+
fsGroup: 1000
16+
containers:
17+
- name: ubuntu
18+
image: ubuntu:22.04
19+
command: ["/bin/bash"]
20+
args: ["-c", "sleep 290"] # Sleep for slightly less than deadline - tools can be installed via kubectl exec if needed
21+
securityContext:
22+
allowPrivilegeEscalation: false
23+
readOnlyRootFilesystem: false
24+
capabilities:
25+
drop:
26+
- ALL
27+
volumeMounts:
28+
- name: profiling-storage
29+
mountPath: /profiling_results
30+
resources:
31+
requests:
32+
memory: "128Mi"
33+
cpu: "100m"
34+
limits:
35+
memory: "256Mi"
36+
cpu: "200m"
37+
volumes:
38+
- name: profiling-storage
39+
persistentVolumeClaim:
40+
claimName: profiling-pvc
41+
restartPolicy: Never
Lines changed: 352 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,352 @@
1+
#!/usr/bin/env python3
2+
3+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
4+
# SPDX-License-Identifier: Apache-2.0
5+
#
6+
# Licensed under the Apache License, Version 2.0 (the "License");
7+
# you may not use this file except in compliance with the License.
8+
# You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS,
14+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
# See the License for the specific language governing permissions and
16+
# limitations under the License.
17+
18+
"""
19+
PVC Results Download Script
20+
21+
This script downloads all relevant profiling results from the profiling PVC to a local directory.
22+
It creates the necessary access pod, downloads the files, and cleans up automatically.
23+
24+
Usage:
25+
python3 download_pvc_results.py --namespace <namespace> --output-dir <local_directory> [--no-config]
26+
27+
Examples:
28+
# Download to ./results directory
29+
python3 download_pvc_results.py --namespace <namespace> --output-dir ./results
30+
31+
# Download to specific directory
32+
python3 download_pvc_results.py --namespace <namespace> --output-dir /home/user/profiling_data
33+
34+
# Download without configuration files
35+
python3 download_pvc_results.py --namespace <namespace> --output-dir ./results --no-config
36+
"""
37+
38+
import argparse
39+
import subprocess
40+
import sys
41+
import time
42+
from pathlib import Path
43+
from typing import List
44+
45+
from utils.kubernetes import check_kubectl_access, deploy_access_pod, run_command
46+
47+
48+
def list_pvc_contents(
    namespace: str, pod_name: str, skip_config: bool = False
) -> List[str]:
    """List the files on the PVC that are worth downloading.

    Runs ``find`` inside the access pod (via ``kubectl exec``) under
    ``/profiling_results`` and returns the matching paths.

    Args:
        namespace: Kubernetes namespace passed to ``kubectl -n``.
        pod_name: Name of the pod in which ``find`` is executed.
        skip_config: When true, ``*.yaml`` / ``*.yml`` files are not listed.

    Returns:
        The non-empty, whitespace-stripped lines of the command's stdout.

    Raises:
        SystemExit: With status 1 when the command raises
            ``subprocess.CalledProcessError``.
    """
    print("Scanning PVC contents...")

    patterns = ["*.png", "*.npz"]
    # Add config file patterns if not skipping them
    if not skip_config:
        patterns += ["*.yaml", "*.yml"]

    # Build "-name A -o -name B ..." from the pattern list.
    name_tests: List[str] = []
    for pattern in patterns:
        if name_tests:
            name_tests.append("-o")
        name_tests += ["-name", pattern]

    # The alternatives must be parenthesised: in find, the implicit AND binds
    # tighter than -o, so without grouping "-type f" would only constrain the
    # first -name test and non-regular entries could match the others.
    # No shell is involved (argv list), so the parentheses need no escaping.
    find_cmd = [
        "kubectl",
        "exec",
        pod_name,
        "-n",
        namespace,
        "--",
        "find",
        "/profiling_results",
        "-type",
        "f",
        "(",
        *name_tests,
        ")",
    ]

    try:
        # Assumes run_command returns an object exposing text ``stdout``
        # (it is a project helper not visible here) - TODO confirm.
        result = run_command(find_cmd, capture_output=True)
    except subprocess.CalledProcessError:
        print("ERROR: Failed to list PVC contents")
        sys.exit(1)

    files = [line.strip() for line in result.stdout.split("\n") if line.strip()]
    config_note = " (excluding config files)" if skip_config else ""
    print(f"Found {len(files)} relevant files to download{config_note}")
    return files
97+
98+
99+
def download_files(
    namespace: str, pod_name: str, files: List[str], output_dir: Path
) -> None:
    """Copy the listed PVC files into ``output_dir``, mirroring their layout.

    Each remote path has its leading ``/profiling_results/`` prefix removed to
    obtain the path below ``output_dir``; the file is then fetched with
    ``kubectl cp``. Paths that would land outside ``output_dir`` are skipped
    and counted as failures, as are copies that raise
    ``subprocess.CalledProcessError``.

    Args:
        namespace: Kubernetes namespace of the access pod.
        pod_name: Name of the pod to copy from.
        files: Remote file paths to download.
        output_dir: Local destination directory (created if missing).
    """
    if not files:
        print("No files to download")
        return

    # Create output directory
    output_dir.mkdir(parents=True, exist_ok=True)
    print(f"Downloading {len(files)} files to {output_dir}")

    remote_root = "/profiling_results/"
    local_root = output_dir.resolve()
    downloaded = 0
    failed = 0

    for file_path in files:
        # Strip only the leading prefix; str.replace would also remove the
        # segment from the middle of a nested path and mangle it.
        rel_path = file_path.removeprefix(remote_root)

        # Reject empty/absolute paths and any ".." path component. Checking
        # components (not substrings) keeps names such as "a..b.png" valid.
        if not rel_path or rel_path.startswith("/") or ".." in rel_path.split("/"):
            print(f" WARNING: Skipping potentially unsafe path: {file_path}")
            failed += 1
            continue

        local_file = output_dir / rel_path

        # Ensure the file is within output_dir
        if not local_file.resolve().is_relative_to(local_root):
            print(f" WARNING: Skipping file outside output directory: {file_path}")
            failed += 1
            continue

        local_file.parent.mkdir(parents=True, exist_ok=True)

        try:
            run_command(
                [
                    "kubectl",
                    "cp",
                    f"{namespace}/{pod_name}:{file_path}",
                    str(local_file),
                ],
                capture_output=True,
            )
        except subprocess.CalledProcessError as e:
            print(f" WARNING: Failed to download {file_path}: {e}")
            failed += 1
            continue

        downloaded += 1
        if downloaded % 5 == 0:  # Progress update every 5 files
            print(f" Downloaded {downloaded}/{len(files)} files...")

    print(f"✓ Download completed: {downloaded} successful, {failed} failed")
155+
156+
157+
def download_summary_files(
    namespace: str, pod_name: str, output_dir: Path, skip_config: bool = False
) -> None:
    """Fetch a fixed set of top-level summary files, if they exist on the PVC.

    For each candidate the pod is first probed with ``test -f``; candidates
    that are absent are skipped without any output. Present files are copied
    with ``kubectl cp`` into ``output_dir``. Any exception raised while
    handling a candidate is reported as a skip and does not abort the loop.

    Args:
        namespace: Kubernetes namespace of the access pod.
        pod_name: Name of the pod to probe and copy from.
        output_dir: Local destination directory.
        skip_config: When true, ``disagg.yaml`` is not attempted.
    """
    candidates = [
        "/profiling_results/prefill_performance.png",
        "/profiling_results/decode_performance.png",
    ]
    if not skip_config:
        # In case it was injected
        candidates.append("/profiling_results/disagg.yaml")

    print("Downloading summary files...")

    for remote_path in candidates:
        base_name = remote_path.split("/")[-1]
        try:
            # Probe with subprocess.run directly so a non-zero exit status is
            # returned to us (check=False) instead of raising.
            probe = subprocess.run(
                ["kubectl", "exec", pod_name, "-n", namespace, "--", "test", "-f", remote_path],
                capture_output=True,
                text=True,
                check=False,
            )
            if probe.returncode != 0:
                # Not present on the PVC: nothing to report.
                continue

            relative = remote_path.replace("/profiling_results/", "")
            if relative.startswith("/") or ".." in relative:
                print(f" ⚠️ Skipped {base_name}: potentially unsafe path")
                continue

            target = output_dir / relative
            inside_output = target.resolve().is_relative_to(output_dir.resolve())
            if not inside_output:
                print(f" ⚠️ Skipped {base_name}: outside output directory")
                continue

            target.parent.mkdir(parents=True, exist_ok=True)
            copy_cmd = [
                "kubectl",
                "cp",
                f"{namespace}/{pod_name}:{remote_path}",
                str(target),
            ]
            run_command(copy_cmd, capture_output=True)
            print(f" ✓ {relative}")
        except Exception as e:
            # Best effort: report the problem and move on to the next file.
            print(f" ⚠️ Skipped {base_name}: {e}")
235+
236+
237+
def cleanup_access_pod(namespace: str, pod_name: str) -> None:
    """Tell the user how the access pod goes away; nothing is deleted here.

    Only prints two informational lines, including the ``kubectl delete``
    command for removing the pod right away.

    NOTE(review): the message promises auto-deletion after 5 minutes; confirm
    this matches what the pod manifest's deadline setting actually does.
    """
    notice = f"ℹ️ Access pod '{pod_name}' will auto-delete in 5 minutes"
    manual_hint = f" To delete immediately: kubectl delete pod {pod_name} -n {namespace}"
    print(notice, manual_hint, sep="\n")
241+
242+
243+
def generate_readme(output_dir: Path, file_count: int) -> None:
    """Write ``README.md`` into ``output_dir`` describing the downloaded files.

    Args:
        output_dir: Directory that receives the README. It is created when it
            does not exist yet, which is the case when no file was downloaded
            beforehand.
        file_count: Number of files reported in the README text.
    """
    readme_content = f"""# Profiling Results

Downloaded {file_count} files from profiling PVC.

## File Structure

### Performance Plots
- `prefill_performance.png` - Main prefill performance across TP sizes
- `decode_performance.png` - Main decode performance across TP sizes

### Interpolation Data
- `selected_prefill_interpolation/raw_data.npz` - Prefill performance data
- `selected_prefill_interpolation/*.png` - Prefill interpolation plots
- `selected_decode_interpolation/raw_data.npz` - Decode performance data
- `selected_decode_interpolation/*.png` - Decode interpolation plots

### Configuration Files
- `disagg.yaml` - DynamoGraphDeployment configuration used for profiling

### Individual TP Results
- `prefill_tp*/` - Individual tensor parallelism profiling results
- `decode_tp*/` - Individual tensor parallelism profiling results

## Loading Data

To load the .npz data files in Python:

```python
import numpy as np

# Load prefill data
prefill_data = np.load('selected_prefill_interpolation/raw_data.npz')
print("Prefill data keys:", list(prefill_data.keys()))

# Load decode data
decode_data = np.load('selected_decode_interpolation/raw_data.npz')
print("Decode data keys:", list(decode_data.keys()))
```

Generated on: {time.strftime('%Y-%m-%d %H:%M:%S')}
"""

    # The directory may not exist yet (nothing was downloaded), in which case
    # opening the README for writing would fail with FileNotFoundError.
    output_dir.mkdir(parents=True, exist_ok=True)

    readme_path = output_dir / "README.md"
    # Explicit encoding keeps the output independent of the platform default.
    readme_path.write_text(readme_content, encoding="utf-8")

    print("📝 Generated README.md with download summary")
292+
293+
294+
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser; the module docstring is the epilog."""
    cli = argparse.ArgumentParser(
        description="Download profiling results from PVC to local directory",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    cli.add_argument(
        "--namespace",
        "-n",
        required=True,
        help="Kubernetes namespace containing the profiling PVC",
    )
    cli.add_argument(
        "--output-dir",
        "-o",
        type=Path,
        required=True,
        help="Local directory to download results to",
    )
    cli.add_argument(
        "--no-config",
        action="store_true",
        help="Skip downloading configuration files (*.yaml, *.yml)",
    )
    return cli


def main():
    """Command-line entry point.

    Parses the arguments, checks kubectl access, deploys the access pod,
    downloads the matching files plus the summary files, writes the README
    and finally prints the cleanup hint and the result location.
    """
    args = _build_arg_parser().parse_args()
    namespace = args.namespace
    target_dir = args.output_dir
    without_config = args.no_config

    print("📥 PVC Results Download")
    print("=" * 40)

    # Fail early if the namespace is not reachable, then bring up the pod.
    check_kubectl_access(namespace)
    pod_name = deploy_access_pod(namespace)

    # Pattern-based download first, then the fixed summary files.
    remote_files = list_pvc_contents(namespace, pod_name, without_config)
    download_files(namespace, pod_name, remote_files, target_dir)
    download_summary_files(namespace, pod_name, target_dir, without_config)

    # The README reports the number of files that were listed on the PVC.
    generate_readme(target_dir, len(remote_files))

    cleanup_access_pod(namespace, pod_name)

    print("\n✅ Download completed!")
    print(f"📁 Results available at: {target_dir.absolute()}")
    print("📄 See README.md for file descriptions")


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)