test_allocate.py (forked from vllm-project/vllm)
import gc
import time

import torch
from torch import Size
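# Overview: this script times CUDA tensor allocation in a setting that looks
# like vLLM resizing its KV cache. A large placeholder for model weights stays
# resident; a first batch of cache-shaped tensors is allocated and freed; then
# a larger batch is allocated with per-allocation timing. Reading the
# [2, num_blocks, 16, 128, 16] shape as a paged KV-cache layout (K/V pair,
# block count, per-block dims) is an inference from the vllm-project/vllm
# fork origin, not something the code states.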
# Allocate an 8 GB placeholder tensor to stand in for resident model weights.
# float16 elements take 2 bytes each.
size_in_gb = 8
element_size = 2  # bytes per float16 element
num_elements = (size_in_gb * 1024**3) // element_size

# Create the placeholder tensor on the GPU.
model_weights = torch.empty(num_elements, dtype=torch.float16, device="cuda")
# Phase 1: allocate 32 cache-shaped tensors at the old block count, then free
# them all, leaving the released memory in the caching allocator's pool.
old_block_number = 1889
shape = Size([2, old_block_number, 16, 128, 16])
tensors = []
for i in range(32):
    start = time.time()
    tensor = torch.zeros(shape, device="cuda", dtype=torch.float16)
    # torch.cuda.synchronize()
    # Without the synchronize above, the timed region covers allocation and
    # kernel launch but not completion of the asynchronous zero-fill.
    allocate_latency = time.time() - start  # warm-up pass; not recorded
    tensors.append(tensor)
tensors.clear()
torch.cuda.empty_cache()
free_mem, _ = torch.cuda.mem_get_info()
print(f"Available space: {free_mem/(1024**2):.2f}MB")
# Phase 2: allocate 32 tensors at the larger block count, recording the
# latency of each allocation.
allocate_latencys = []
block_number = 5198
shape = Size([2, block_number, 16, 128, 16])
torch.cuda.empty_cache()
gc.collect()
for i in range(32):
    start = time.time()
    tensor = torch.zeros(shape, device="cuda", dtype=torch.float16)
    allocate_latency = time.time() - start
    allocate_latencys.append(allocate_latency)
    # tensors[i].data = tensor
    tensors.append(tensor)
    # Ensure the zero-fill finishes before the next iteration is timed; the
    # recorded latency itself still excludes kernel completion.
    torch.cuda.synchronize()
avg_allocate_latency = sum(allocate_latencys) / len(allocate_latencys)
print(f"average allocate latency: {avg_allocate_latency:.3f}s, "
      f"sum of allocate latencies: {sum(allocate_latencys):.3f}s")
print(allocate_latencys)
# Final report: the 32 phase-2 tensors are still referenced by `tensors`,
# so their memory is not released by empty_cache() here.
torch.cuda.empty_cache()
free_mem, _ = torch.cuda.mem_get_info()
print(f"Available space: {free_mem/(1024**2):.2f}MB")