forked from karpathy/llm.c
-
Notifications
You must be signed in to change notification settings - Fork 6
/
profile_gpt2cu.py
148 lines (126 loc) · 5.19 KB
/
profile_gpt2cu.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# runs profiling with ncu, generates a `profile.ncu-rep` for viewing with NSight Compute, and prints out
# basic kernel stats.
# Note: If you run into errors because of missing access rights to performance counters, try
# https://developer.nvidia.com/nvidia-development-tools-solutions-err_nvgpuctrperm-permission-issue-performance-counters#SolnAdminTag
import subprocess
import csv
from collections import defaultdict
import shutil
# find ncu: Is it on PATH?
NCU = shutil.which("ncu")
# otherwise, guess a standard location
if NCU is None:
NCU = "/usr/local/cuda/bin/ncu"
# build the exe
subprocess.check_call(["make", "profile_gpt2cu"])
# record metrics
# --full and --import-source are entirely superfluous for this script, but you might want to
# manually inspect `profile.ncu-rep`, so we keep it here
cmd = [NCU, "--set", "full", "--import-source", "yes", "-o", "profile", "-f", "./profile_gpt2cu"]
subprocess.check_call(cmd)
# generate csv
# https://forums.developer.nvidia.com/t/converting-nsys-rep-file-into-a-csv-file-with-formatting-like-the-summary-page-in-ncu-gui/231717/3
metrics = [
"gpu__time_duration.sum", # total time
"dram__bytes_read.sum", # DRAM reads
"dram__bytes_write.sum", # DRAM writes
"lts__t_sectors_srcunit_tex_op_read.sum", # L2 reads (sectors -- 32B)
"lts__t_sectors_srcunit_tex_op_write.sum", # L2 reads (sectors -- 32B)
"smsp__inst_executed.sum", # instructions
]
cmd = [NCU, "-i", "profile.ncu-rep", "--csv", "--page", "raw", "--metrics", ",".join(metrics)]
result = subprocess.check_output(cmd, text=True).strip()
reader = csv.reader(result.splitlines(keepends=True))
# model config
CLS_START = 15
CLS_NUM = 6
ADAM_ID = 44
N_LAYERS = 12
summaries = defaultdict(lambda: 0.0)
passes = defaultdict(lambda: 0.0)
total = defaultdict(lambda: 0.0)
no_cutlass = 0.0
CC = ""
print()
print("Kernel calls:")
for rid, row in enumerate(reader):
if rid == 0:
# headings
print(f"id pass {'name':<40} {'time':>8} {'RAM rd':>8} {'RAM wt':>8} {'L2 rd':>8} {'L2 wt':>8} {'inst':>8}")
continue
if rid == 1:
# units
units = f" {'':<40} {'ms':>8} {'GiB':>8} {'GiB':>8} {'GiB':>8} {'GiB':>8} {'MInst':>8}"
print(units)
print("." * len(units))
continue
if rid == 2:
CC = row[10]
# actual data
kernel = row[4]
time = float(row[13])
read = float(row[11])
write = float(row[12])
l2_read = float(row[14])
l2_write = float(row[15])
inst = float(row[16]) / 1e6
kid = rid - 2
if kid == 0 or kid == ADAM_ID - 1:
pass_name = "enc"
elif CLS_START <= kid < CLS_START + CLS_NUM:
# the classifier part, counts only once
pass_name = "cls"
elif kid == ADAM_ID:
# encoder layer or adam
pass_name = "opt"
else:
pass_name = "fwd" if kid < CLS_START else "bwd"
time *= N_LAYERS
read *= N_LAYERS
write *= N_LAYERS
l2_read *= N_LAYERS
l2_write *= N_LAYERS
# split at "(" -- argument list
fn_name = kernel.split("(")[0]
# some names include the return value, others don't?
if " " in fn_name:
fn_name = fn_name.split(" ")[1]
if "cutlass" in fn_name:
fn_name = fn_name.split("<")[0]
pass
else:
no_cutlass += time
# convert L2 to GiB
l2_read = l2_read * 32 / 1024 / 1024 / 1024
l2_write = l2_write * 32 / 1024 / 1024 / 1024
summaries[fn_name] += time
passes[pass_name] += time
total['time'] += time
total['read'] += read
total['write'] += write
total['l2_read'] += l2_read
total['l2_write'] += l2_write
total['inst'] += inst
print(f"{kid:02} {pass_name:4} {fn_name:<40} {time:8.2f} {read:8.2f} {write:8.2f} {l2_read:8.2f} {l2_write:8.2f} {inst:8.2f}")
total_time = total['time']
print("." * len(units))
print(f" {'Total':<40} {total['time']:8.2f} {total['read']:8.2f} {total['write']:8.2f} {total['l2_read']:8.2f} {total['l2_write']:8.2f} {total['inst']:8.2f}")
print()
print("Kernel type summaries:")
print(f" {'name':<40} {'time':>6} {'frac':>6}")
ordered = sorted(summaries.items(), key=lambda x: x[1], reverse=True)
for entry, value in ordered:
print(f" {entry:<40} {value:6.2f} {100*value / total_time:6.2f}%")
ts = total_time / 1000
summary = f"""
In total, a training step takes {total_time:.1f}ms, distributed as:
{passes['enc']:.1f}ms ({100 * passes['enc'] / total_time:.1f}%) in the encoder,
{passes['fwd']:.1f}ms ({100 * passes['fwd'] / total_time:.1f}%) in forward blocks,
{passes['cls']:.1f}ms ({100 * passes['cls'] / total_time:.1f}%) in the classifier part,
{passes['bwd']:.1f}ms ({100 * passes['bwd'] / total_time:.1f}%) in backward blocks, and
{passes['opt']:.1f}ms ({100 * passes['opt'] / total_time:.1f}%) in the optimizer.
We read {total['read']:.1f}GiB ({total['read']/ts:.1f}GB/s) and write {total['write']:.1f}GiB ({total['write']/ts:.1f}GB/s) to DRAM,
read {total['l2_read']:.1f}GiB ({total['l2_read']/ts:.1f}GB/s) and write {total['l2_write']:.1f}GiB ({total['l2_write']/ts:.1f}GB/s) to L2,
and execute {total['inst'] / 1000:.1f} billion instructions ({total['inst'] / 1000 / ts:.1f} GInst/s).
"""
print(summary)