@@ -32,7 +32,7 @@ This tutorial will show you exactly how to replicate those speedups so you can b
For GPU (newer generation GPUs will see drastically better performance)

```
- pip3 install numpy --pre torch[dynamo] --force-reinstall --extra-index-url https://download.pytorch.org/whl/nightly/cu117
+ pip3 install numpy --pre torch --force-reinstall --extra-index-url https://download.pytorch.org/whl/nightly/cu117

```

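A quick sanity check you might run after installing the nightly build (an editorial aside, not part of the original diff), assuming a CUDA-capable machine:

``` python
import torch

# Confirm the nightly wheel is installed and that torch.compile is available
print(torch.__version__)          # nightly builds report a dev version string
print(torch.cuda.is_available())  # True if the cu117 wheel can see a GPU
print(hasattr(torch, "compile"))  # torch.compile ships with the 2.0 nightlies
```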
@@ -78,16 +78,16 @@ by step. Please note that you’re likely to see more significant speedups the n

``` python
import torch
- def fn(x, y):
-     a = torch.sin(x).cuda()
-     b = torch.sin(y).cuda()
-     return a + b
- new_fn = torch.compile(fn, backend="inductor")
- input_tensor = torch.randn(10000).to(device="cuda:0")
- a = new_fn()
+ def fn(x, y):
+     a = torch.sin(x).cuda()
+     b = torch.sin(y).cuda()
+     return a + b
+ new_fn = torch.compile(fn, backend="inductor")
+ input_tensor = torch.randn(10000).to(device="cuda:0")
+ a = new_fn()
```

- This example won't actually run faster but it's a good educational.
+ This example won't actually run faster but it's educational.

The example features `torch.cos()` and `torch.sin()`, which are pointwise ops, meaning they operate element by element on a vector. A more famous pointwise op you might actually want to use would be something like `torch.relu()`.

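One detail worth flagging (an editorial aside, not part of the diff): the snippet calls `new_fn()` without arguments even though `fn` takes two tensors, and `input_tensor` goes unused. A minimal runnable variant, assuming a CUDA device, might look like this:

``` python
import torch

def fn(x, y):
    a = torch.sin(x).cuda()
    b = torch.sin(y).cuda()
    return a + b

new_fn = torch.compile(fn, backend="inductor")
input_tensor = torch.randn(10000).to(device="cuda:0")
# Pass the tensor for both arguments; the first call triggers compilation
a = new_fn(input_tensor, input_tensor)
```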
@@ -110,17 +110,17 @@ TORCHINDUCTOR_TRACE=1 python trig.py
``` python

@pointwise(size_hints=[16384], filename=__file__, meta={'signature': {0: '*fp32', 1: '*fp32', 2: 'i32'}, 'device': 0, 'constants': {}, 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
- @triton.jit
- def kernel(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
-     xnumel = 10000
-     xoffset = tl.program_id(0) * XBLOCK
-     xindex = xoffset + tl.reshape(tl.arange(0, XBLOCK), [XBLOCK])
-     xmask = xindex < xnumel
-     x0 = xindex
-     tmp0 = tl.load(in_ptr0 + (x0), xmask)
-     tmp1 = tl.sin(tmp0)
-     tmp2 = tl.sin(tmp1)
-     tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask)
+ @triton.jit
+ def kernel(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
+     xnumel = 10000
+     xoffset = tl.program_id(0) * XBLOCK
+     xindex = xoffset + tl.reshape(tl.arange(0, XBLOCK), [XBLOCK])
+     xmask = xindex < xnumel
+     x0 = xindex
+     tmp0 = tl.load(in_ptr0 + (x0), xmask)
+     tmp1 = tl.sin(tmp0)
+     tmp2 = tl.sin(tmp1)
+     tl.store(out_ptr0 + (x0 + tl.zeros([XBLOCK], tl.int32)), tmp2, xmask)

```

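For context (an addition, not part of the original diff): the generated kernel above reads a single input and applies `tl.sin` twice, which is what you would expect from a script along these lines — a hypothetical reconstruction of `trig.py`, assuming a CUDA device — where Inductor fuses the chained pointwise ops into one Triton kernel:

``` python
import torch

def trig(x):
    # Two chained pointwise ops; Inductor can fuse them into a single generated kernel
    return torch.sin(torch.sin(x))

compiled_trig = torch.compile(trig, backend="inductor")
x = torch.randn(10000, device="cuda")
out = compiled_trig(x)  # run with TORCHINDUCTOR_TRACE=1 to dump the generated Triton code
```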
@@ -132,9 +132,9 @@ As a next step let’s try a real model like resnet50 from the PyTorch hub.

``` python
import torch
- model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True)
- opt_model = torch.compile(model, backend="inductor")
- model(torch.randn(1,3,64,64))
+ model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True)
+ opt_model = torch.compile(model, backend="inductor")
+ model(torch.randn(1,3,64,64))

```

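Worth noting (an editorial aside, not part of the diff): the last line calls the original `model` rather than the compiled `opt_model`. A variant that actually exercises the compiled module, assuming you want to run on GPU, might look like:

``` python
import torch

model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True).cuda().eval()
opt_model = torch.compile(model, backend="inductor")

with torch.no_grad():
    # The first call compiles the model; later calls reuse the compiled graph
    out = opt_model(torch.randn(1, 3, 64, 64, device="cuda"))
```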
@@ -152,14 +152,14 @@ So we’re going to directly download a pretrained model from the Hugging Face h
``` python

import torch
- from transformers import BertTokenizer, BertModel
- # Copy pasted from here https://huggingface.co/bert-base-uncased
- tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
- model = BertModel.from_pretrained("bert-base-uncased").to(device="cuda:0")
- model = torch.compile(model) # This is the only line of code that we changed
- text = "Replace me by any text you'd like."
- encoded_input = tokenizer(text, return_tensors='pt').to(device="cuda:0")
- output = model(**encoded_input)
+ from transformers import BertTokenizer, BertModel
+ # Copy pasted from here https://huggingface.co/bert-base-uncased
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+ model = BertModel.from_pretrained("bert-base-uncased").to(device="cuda:0")
+ model = torch.compile(model) # This is the only line of code that we changed
+ text = "Replace me by any text you'd like."
+ encoded_input = tokenizer(text, return_tensors='pt').to(device="cuda:0")
+ output = model(**encoded_input)

```

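If you want to measure the speedup yourself (an editorial addition, not part of the diff), remember that the first call to a compiled model pays the compilation cost, so benchmark warmed-up iterations. A minimal sketch, assuming the `model` and `encoded_input` from the snippet above:

``` python
import time
import torch

def bench(fn, iters=10):
    fn()  # warm-up: the first call triggers compilation for a compiled model
    torch.cuda.synchronize()
    start = time.perf_counter()
    for _ in range(iters):
        fn()
    torch.cuda.synchronize()
    return (time.perf_counter() - start) / iters

# Example usage:
# print(bench(lambda: model(**encoded_input)))
```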
@@ -171,10 +171,10 @@ Similarly let’s try out a TIMM example

``` python
import timm
- import torch
- model = timm.create_model('resnext101_32x8d', pretrained=True, num_classes=2)
- opt_model = torch.compile(model, backend="inductor")
- opt_model(torch.randn(64,3,7,7))
+ import torch
+ model = timm.create_model('resnext101_32x8d', pretrained=True, num_classes=2)
+ opt_model = torch.compile(model, backend="inductor")
+ opt_model(torch.randn(64,3,7,7))
```

Our goal with PyTorch was to build a breadth-first compiler that would speed up the vast majority of actual models people run in open source. The Hugging Face Hub ended up being an extremely valuable benchmarking tool for us, ensuring that any optimization we work on actually helps accelerate models people want to run.