@@ -32,12 +32,14 @@ Greedy search works well for tasks with relatively short outputs where creativit

```py
import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device
+
+device = infer_device()

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
-inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
+inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to(device)

-model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to(device)
# explicitly set to the default length because Llama2 generation length is 4096
outputs = model.generate(**inputs, max_new_tokens=20)
tokenizer.batch_decode(outputs, skip_special_tokens=True)
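
The change above replaces a hardcoded `"cuda"` device with the `infer_device` helper. As an illustration only (not the actual `transformers.infer_device` implementation), a device-inference helper of this kind can be sketched as a fallback chain over the available backends:

```py
import torch

# Hypothetical sketch of a device-inference helper; the real
# transformers.infer_device may check additional backends.
def infer_device_sketch() -> str:
    if torch.cuda.is_available():
        return "cuda"
    if torch.backends.mps.is_available():  # Apple Silicon GPUs
        return "mps"
    return "cpu"

print(infer_device_sketch())
```
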
@@ -52,12 +54,14 @@ Enable multinomial sampling with `do_sample=True` and `num_beams=1`.

```py
import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device
+
+device = infer_device()

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
-inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
+inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to(device)

-model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to(device)
# explicitly set to 50 because Llama2 generation length is 4096
outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, num_beams=1)
tokenizer.batch_decode(outputs, skip_special_tokens=True)
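
Sampling is usually shaped further with standard `generate()` parameters such as `temperature` and `top_p`. A short follow-on sketch reusing the `model`, `tokenizer`, and `inputs` from the example above (the specific values are illustrative, not recommendations):

```py
# Lower temperature sharpens the distribution; top_p keeps only the smallest
# set of tokens whose cumulative probability reaches 0.9 (nucleus sampling).
outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
)
tokenizer.batch_decode(outputs, skip_special_tokens=True)
```
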
@@ -75,12 +79,14 @@ Enable beam search with the `num_beams` parameter (should be greater than 1 othe

```py
import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device
+
+device = infer_device()

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
-inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
+inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to(device)

-model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to(device)
# explicitly set to 50 because Llama2 generation length is 4096
outputs = model.generate(**inputs, max_new_tokens=50, num_beams=2)
tokenizer.batch_decode(outputs, skip_special_tokens=True)
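
Because beam search keeps several hypotheses alive, it can also return more than one of them. A small variation on the example above, reusing the same objects (`num_return_sequences` may not exceed `num_beams`):

```py
# Return both beams instead of only the highest-scoring one.
outputs = model.generate(**inputs, max_new_tokens=50, num_beams=2, num_return_sequences=2)
tokenizer.batch_decode(outputs, skip_special_tokens=True)
```
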
@@ -160,12 +166,14 @@ Enable prompt lookup decoding with the `prompt_lookup_num_tokens` parameter.

```py
import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device
+
+device = infer_device()

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-1.7B")
-model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-1.7B", torch_dtype=torch.float16).to("cuda")
-assistant_model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-135M", torch_dtype=torch.float16).to("cuda")
-inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
+model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-1.7B", torch_dtype=torch.float16).to(device)
+assistant_model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-135M", torch_dtype=torch.float16).to(device)
+inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to(device)

outputs = model.generate(**inputs, assistant_model=assistant_model, max_new_tokens=20, prompt_lookup_num_tokens=5)
tokenizer.batch_decode(outputs, skip_special_tokens=True)
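
For comparison, the same draft model can be used for plain assisted (speculative) decoding by simply omitting `prompt_lookup_num_tokens`; this sketch reuses the objects defined above:

```py
# Assisted decoding: the small draft model proposes candidate tokens that the
# larger model then verifies in parallel.
outputs = model.generate(**inputs, assistant_model=assistant_model, max_new_tokens=20)
tokenizer.batch_decode(outputs, skip_special_tokens=True)
```
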
@@ -226,12 +234,14 @@ Enable contrastive search with the `penalty_alpha` and `top_k` parameters. The `

```py
import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device
+
+device = infer_device()

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
-inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
+inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to(device)

-model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to(device)
# explicitly set to 100 because Llama2 generation length is 4096
outputs = model.generate(**inputs, max_new_tokens=100, penalty_alpha=0.6, top_k=4)
tokenizer.batch_decode(outputs, skip_special_tokens=True)
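
If the same contrastive-search settings are used repeatedly, they can be bundled into a `GenerationConfig` instead of being passed as loose keyword arguments. A minimal sketch with the objects defined above:

```py
from transformers import GenerationConfig

# Package the contrastive-search hyperparameters for reuse.
contrastive_config = GenerationConfig(max_new_tokens=100, penalty_alpha=0.6, top_k=4)
outputs = model.generate(**inputs, generation_config=contrastive_config)
tokenizer.batch_decode(outputs, skip_special_tokens=True)
```
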
@@ -262,11 +272,13 @@ Enable DoLa with the following parameters.

```py
import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device
+
+device = infer_device()

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-1.7B")
-model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-1.7B", torch_dtype=torch.float16).to("cuda")
-inputs = tokenizer("What is the highest peak in the world?", return_tensors="pt").to("cuda")
+model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-1.7B", torch_dtype=torch.float16).to(device)
+inputs = tokenizer("What is the highest peak in the world?", return_tensors="pt").to(device)

outputs = model.generate(**inputs, max_new_tokens=50, dola_layers="high", do_sample=False)
tokenizer.batch_decode(outputs, skip_special_tokens=True)
@@ -280,11 +292,13 @@ Contrast layers 18 and 20 with the final layer.

```py
import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device
+
+device = infer_device()

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-1.7B")
-model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-1.7B", torch_dtype=torch.float16).to("cuda")
-inputs = tokenizer("What is the highest peak in the world?", return_tensors="pt").to("cuda")
+model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-1.7B", torch_dtype=torch.float16).to(device)
+inputs = tokenizer("What is the highest peak in the world?", return_tensors="pt").to(device)

outputs = model.generate(**inputs, max_new_tokens=50, dola_layers=[18, 20], do_sample=False, repetition_penalty=1.2)
tokenizer.batch_decode(outputs[:, inputs.input_ids.shape[-1]:], skip_special_tokens=True)
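
Besides an explicit list of layer indices, `dola_layers` also accepts `"low"` to contrast the lower portion of the model with the final layer; the call is otherwise unchanged:

```py
# Contrast lower layers with the final layer instead of a fixed pair of indices.
outputs = model.generate(**inputs, max_new_tokens=50, dola_layers="low", do_sample=False, repetition_penalty=1.2)
tokenizer.batch_decode(outputs[:, inputs.input_ids.shape[-1]:], skip_special_tokens=True)
```
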
@@ -302,12 +316,14 @@ Enable diverse beam search with the `num_beams`, `num_beam_groups` and `diversit

```py
import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device
+
+device = infer_device()

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
-inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
+inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to(device)

-model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to(device)
# explicitly set to 50 because Llama2 generation length is 4096
outputs = model.generate(**inputs, max_new_tokens=50, num_beams=6, num_beam_groups=3, diversity_penalty=1.0, do_sample=False)
tokenizer.batch_decode(outputs, skip_special_tokens=True)
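
Since the point of diverse beam search is to produce dissimilar candidates, it is often paired with `num_return_sequences` so that several of the hypotheses are actually returned (it may not exceed `num_beams`). A sketch reusing the objects above:

```py
# Return several of the diverse hypotheses rather than only the single best one.
outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    num_beams=6,
    num_beam_groups=3,
    diversity_penalty=1.0,
    do_sample=False,
    num_return_sequences=3,
)
tokenizer.batch_decode(outputs, skip_special_tokens=True)
```
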