1
- import argparse
2
1
import csv
3
2
import gc
4
3
import json
5
- import os
6
- import random
7
- import statistics
8
- import time
9
4
10
- import numpy as np
11
5
import torch
12
- import torchvision .models as models
13
- from sklearn .neighbors import KDTree
14
- from torch .profiler import ProfilerActivity , profile , record_function
15
6
from transformers import (
16
- AutoConfig ,
17
7
AutoModelForCausalLM ,
18
8
AutoTokenizer ,
19
- BertConfig ,
20
- BertForMaskedLM ,
21
- GPT2ForSequenceClassification ,
22
- PegasusConfig ,
23
- PegasusForCausalLM ,
9
+ AutoModelForImageClassification ,
10
+ AutoModelForObjectDetection
24
11
)
25
12
13
+
14
+
26
15
from centml .compiler .prediction .kdtree import KDTreeWithValues
27
16
from centml .compiler .prediction .profiler import Profiler
17
+ from scripts .timer import timed
28
18
29
19
torch .set_float32_matmul_precision ('high' )
30
20
torch .set_default_device ('cuda' )
34
24
OUTPUT_FILE = 'data.csv'
35
25
36
26
# Different HuggingFace Models + Different Input Sizes
37
# Each entry is (Hugging Face model id, input_size); input_size is forwarded
# unchanged to llm_test. Presumably (batch, sequence length) - TODO confirm.
# Model ids must not carry stray whitespace, or the hub lookup fails.
llm_tests = [
    ("google/gemma-7b", (1, 128)),
    ("microsoft/phi-2", (1, 512)),
    ("microsoft/phi-2", (2, 512)),
    ("facebook/bart-large", (1, 1024)),
    ("facebook/bart-large", (2, 512)),
    ("gpt2-xl", (1, 1024)),
    ("gpt2-xl", (1, 720)),
    ("gpt2-xl", (1, 512)),
    ("gpt2-xl", (2, 512)),
    ("gpt2-xl", (4, 256)),
    ("EleutherAI/gpt-neo-2.7B", (1, 512)),
    ("EleutherAI/gpt-neo-2.7B", (1, 256)),
    ("gpt2-large", (1, 1024)),
    ("gpt2-large", (1, 720)),
    ("gpt2-large", (1, 512)),
    ("google-bert/bert-large-uncased", (8, 512)),
    ("google-bert/bert-large-uncased", (16, 512)),
    ("meta-llama/Meta-Llama-3.1-8B", (1, 256)),
    ("gpt2-medium", (1, 1024)),
    ("gpt2-medium", (1, 512)),
    ("gpt2-medium", (2, 512)),
    ("google/pegasus-cnn_dailymail", (1, 1024)),
    ("google/pegasus-cnn_dailymail", (1, 512)),
    ("google/pegasus-cnn_dailymail", (2, 512)),
]
50
53
51
- # Different Batch Sizes for each ResNet Model (torchvision)
52
- resnet_tests = [1024 , 720 , 1440 ]
53
-
54
+ # Tests for larger GPUs (A100, H100, etc.)
55
+ # large_llm_tests = [
56
+ # ("google/gemma-7b", (1, 256)),
57
+ # ("google/gemma-7b", (1, 512)),
58
+ # ("google/gemma-7b", (1, 1024)),
59
+ # ("microsoft/phi-2", (1,1024)),
60
+ # ("microsoft/phi-2", (1,2048)),
61
+ # ("microsoft/phi-2", (2,1024)),
62
+ # ("EleutherAI/gpt-neo-2.7B", (1, 1024)),
63
+ # ("gpt2-xl", (2, 1024)),
64
+ # ("gpt2-xl", (4, 512)),
65
+ # ("meta-llama/Meta-Llama-3.1-8B", (1, 1024)),
66
+ # ("meta-llama/Meta-Llama-3.1-8B", (1, 512)),
67
+ # ("google/pegasus-cnn_dailymail", (4, 1024)),
68
+ # ("facebook/bart-large", (4, 1024)),
69
+ # ("facebook/bart-large", (2, 1024)),
70
+ # ("google-bert/bert-large-uncased", (16, 512)),
71
+ # ("gpt2-medium", (2, 1024)),
72
+ # ("gpt2-medium", (4, 512)),
73
+ # ("gpt2-large", (2, 1024)),
74
+ # ("gpt2-large", (4, 512)),
75
+ # ]
76
+
77
+ # Different Batch Sizes for each image classification model
78
# (model id, batch size) pairs consumed by image_classification_test.
image_classification_tests = [
    ("google/efficientnet-b0", 512),
    ("google/efficientnet-b0", 256),
    ("google/efficientnet-b0", 128),
    ("google/vit-base-patch16-224", 128),
    ("microsoft/resnet-50", 256),
    ("microsoft/resnet-50", 512),
]
54
86
55
- def timed ( fn ):
56
- start = torch . cuda . Event ( enable_timing = True )
57
- end = torch . cuda . Event ( enable_timing = True )
58
- start . record ()
59
- result = fn ()
60
- end . record ()
61
- torch . cuda . synchronize ()
62
- return result , start . elapsed_time ( end ) / 1000
87
+ # Different Batch Sizes for each object detection model
88
# (model id, batch size) pairs consumed by object_detection_test.
object_detection_tests = [
    ("hustvl/yolos-tiny", 128),
    ("hustvl/yolos-tiny", 256),
    ("hustvl/yolos-tiny", 512),
    ("facebook/detr-resnet-50", 128),
    ("facebook/detr-resnet-50", 256),
]
63
95
64
96
65
97
def percent_error (observed , true ):
@@ -90,24 +122,28 @@ def get(self, key, inp):
90
122
91
123
92
124
db = DataCollectionTreeDB ()
93
- added_time = 0
125
+ cuda_kernel_time = 0
126
+ actual_time = 0
94
127
95
128
96
129
def custom_backend(gm: torch.fx.GraphModule, inps):
    """Backend for ``torch.compile`` that routes each call through a Profiler.

    Args:
        gm: The FX graph module passed in by ``torch.compile``.
        inps: Example inputs passed in by ``torch.compile``; not used here.

    Returns:
        A callable that forwards its arguments to ``Profiler.propagate``,
        adds the two timings it reports to the module-level
        ``cuda_kernel_time`` and ``actual_time`` accumulators, and returns
        the propagated output.
    """
    print("Compiling")
    graph_profiler = Profiler(
        mod=gm,
        gpu=CURR_GPU,
        treeDB=db,
        data_collection_mode=True,
    )

    def forward(*args):
        global cuda_kernel_time, actual_time
        out, kernel_elapsed, actual_elapsed = graph_profiler.propagate(*args)
        # Accumulate across graph calls; the caller resets both totals
        # once a model's run has been reported.
        cuda_kernel_time += kernel_elapsed
        actual_time += actual_elapsed
        return out

    return forward
107
142
108
143
109
- def hf_model_test (model_name , input_size , custom_backend ):
110
- global added_time
144
+ def llm_test (model_name , input_size , custom_backend ):
145
+ global cuda_kernel_time
146
+ global actual_time
111
147
models_without_tokenizer = {"google/pegasus-cnn_dailymail" }
112
148
113
149
model = AutoModelForCausalLM .from_pretrained (model_name ).to ("cuda:0" )
@@ -131,22 +167,55 @@ def hf_model_test(model_name, input_size, custom_backend):
131
167
compiled_model = torch .compile (model , backend = custom_backend )
132
168
compiled_model (inp )
133
169
134
- added_time /= 1000000
170
+ cuda_kernel_time /= 1000000
135
171
136
172
print (f"{ model_name } , { input_size } " )
137
- print ("Real time: " , t )
138
- print ("TOTAL TIME : " , added_time )
139
- print ("Error: " , percent_error (added_time , t ))
173
+ print ("Real time: " , actual_time )
174
+ print ("Kernel execution time : " , cuda_kernel_time )
175
+ print ("Error: " , percent_error (cuda_kernel_time , actual_time ))
140
176
141
- added_time = 0
177
+ cuda_kernel_time = 0
178
+ actual_time = 0
142
179
del model , inp , compiled_model
143
180
gc .collect ()
144
181
torch .cuda .empty_cache ()
145
182
146
183
147
- def resnet_test (batch_size , custom_backend ):
148
- global added_time
149
- model = models .resnet50 (weights = True , num_classes = 1000 ).cuda ()
184
def image_classification_test(model_name, batch_size, custom_backend):
    """Run one image-classification model eagerly, then once via ``custom_backend``.

    Loads ``model_name`` onto ``cuda:0``, prints the timing of ten eager
    forward passes on random input, runs the model once through
    ``torch.compile`` with ``custom_backend``, prints the accumulated
    timings and their percent error, then resets the module-level
    accumulators and frees the model and input.

    Args:
        model_name: Hugging Face model id to load.
        batch_size: Leading dimension of the random input tensor.
        custom_backend: Backend callable handed to ``torch.compile``.
    """
    global cuda_kernel_time, actual_time

    # Only the ViT checkpoint is fed 224x224 images; all others get 128x128.
    side = 224 if model_name == "google/vit-base-patch16-224" else 128

    model = AutoModelForImageClassification.from_pretrained(model_name).to("cuda:0")
    model.eval()
    inp = torch.randn(batch_size, 3, side, side).cuda(0)

    with torch.inference_mode():
        # Eager runs: print each timing as reported by timed().
        for _ in range(10):
            _, t = timed(lambda: model(inp))
            print(t)

        # NOTE(review): indentation was lost in SOURCE; the compiled run is
        # assumed to sit inside the inference_mode block - confirm.
        compiled_model = torch.compile(model, backend=custom_backend)
        compiled_model(inp)

    cuda_kernel_time /= 1_000_000

    print(f"{model_name}, {batch_size}")
    print("Real time: ", actual_time)
    print("TOTAL TIME: ", cuda_kernel_time)
    print("Error: ", percent_error(cuda_kernel_time, actual_time))

    # Reset the accumulators so the next model starts from zero.
    cuda_kernel_time = 0
    actual_time = 0

    # Drop references before collecting so GPU memory can be released.
    del model, inp, compiled_model
    gc.collect()
    torch.cuda.empty_cache()
214
+
215
+ def object_detection_test (model_name , batch_size , custom_backend ):
216
+ global cuda_kernel_time
217
+ global actual_time
218
+ model = AutoModelForObjectDetection .from_pretrained (model_name ).to ("cuda:0" )
150
219
model .eval ()
151
220
inp = torch .randn (batch_size , 3 , 128 , 128 ).cuda (0 )
152
221
@@ -157,22 +226,31 @@ def resnet_test(batch_size, custom_backend):
157
226
158
227
compiled_model = torch .compile (model , backend = custom_backend )
159
228
compiled_model (inp )
160
- print (f"resnet, ({ batch_size } , 3, 128, 128)" )
161
- print ("Real time: " , t )
162
- print ("TOTAL TIME: " , added_time )
163
- print ("Error: " , percent_error (added_time , t ))
164
229
165
- added_time = 0
230
+ cuda_kernel_time /= 1000000
231
+
232
+ print (f"{ model_name } , { batch_size } " )
233
+ print ("Real time: " , actual_time )
234
+ print ("TOTAL TIME: " , cuda_kernel_time )
235
+ print ("Error: " , percent_error (cuda_kernel_time , actual_time ))
236
+
237
+ cuda_kernel_time = 0
238
+ actual_time = 0
166
239
del model , inp , compiled_model
167
240
gc .collect ()
168
241
torch .cuda .empty_cache ()
169
242
243
+ # for model_name, input_size in large_llm_tests:
244
+ # llm_test(model_name, input_size, custom_backend)
245
+
246
# Run every suite in order: LLMs, object detection, then image classification.
# Each runner takes (model id, size spec, backend).
for run_case, cases in (
    (llm_test, llm_tests),
    (object_detection_test, object_detection_tests),
    (image_classification_test, image_classification_tests),
):
    for model_name, size_spec in cases:
        run_case(model_name, size_spec, custom_backend)
176
254
177
255
# Write to CSV
178
256
with open (OUTPUT_FILE , 'w' , newline = '' ) as csvfile :
0 commit comments