
Commit 04cebe6

Merge remote-tracking branch 'refs/remotes/origin/shared-experts' into shared-experts
2 parents 7a0e3c2 + 90d035f commit 04cebe6

40 files changed: +4787 -446 lines

docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile

Lines changed: 5 additions & 5 deletions
@@ -1,11 +1,11 @@
-FROM rocm/dev-ubuntu-22.04:5.6
+FROM rocm/dev-ubuntu-22.04:6.3
 LABEL maintainer="Hugging Face"
 
 ARG DEBIAN_FRONTEND=noninteractive
-ARG PYTORCH='2.1.1'
-ARG TORCH_VISION='0.16.1'
-ARG TORCH_AUDIO='2.1.1'
-ARG ROCM='5.6'
+ARG PYTORCH='2.5.1'
+ARG TORCH_VISION='0.20.0'
+ARG TORCH_AUDIO='2.5.0'
+ARG ROCM='6.3'
 
 RUN apt update && \
 apt install -y --no-install-recommends \

docs/source/en/agents_advanced.md

Lines changed: 1 addition & 1 deletion
@@ -162,7 +162,7 @@ agent.run(
 improved_prompt could be "A bright blue space suit wearing rabbit, on the surface of the moon, under a bright orange sunset, with the Earth visible in the background"
 
 Now that I have improved the prompt, I can use the image generator tool to generate an image based on this prompt.
->>> Agent is executing the code below:
+=== Agent is executing the code below:
 image = image_generator(prompt="A bright blue space suit wearing rabbit, on the surface of the moon, under a bright orange sunset, with the Earth visible in the background")
 final_answer(image)
 ```

docs/source/en/chat_templating.md

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@ Let's make this concrete with a quick example using the `mistralai/Mistral-7B-In
 ... ]
 
 >>> tokenizer.apply_chat_template(chat, tokenize=False)
-"<s>[INST] Hello, how are you? [/INST]I'm doing great. How can I help you today?</s> [INST] I'd like to show off how chat templating works! [/INST]"
+"<s> [INST] Hello, how are you? [/INST] I'm doing great. How can I help you today?</s> [INST] I'd like to show off how chat templating works! [/INST]"
 ```
 
 Notice how the tokenizer has added the control tokens [INST] and [/INST] to indicate the start and end of
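
For reference, the call whose output string changed in this hunk can be reproduced with a short snippet. This is a minimal sketch, assuming the truncated checkpoint name in the hunk header refers to a Mistral-7B-Instruct model ("mistralai/Mistral-7B-Instruct-v0.1" is used here as a stand-in) and that you have access to it on the Hub:

```python
from transformers import AutoTokenizer

# Assumption: stand-in checkpoint for the truncated name in the hunk header above.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")

chat = [
    {"role": "user", "content": "Hello, how are you?"},
    {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
    {"role": "user", "content": "I'd like to show off how chat templating works!"},
]

# tokenize=False returns the rendered template string (the before/after values in the diff)
# rather than token ids.
print(tokenizer.apply_chat_template(chat, tokenize=False))
```

The exact whitespace around the [INST] markers depends on the chat template shipped with the tokenizer, which is what this documentation update tracks.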

docs/source/en/generation_strategies.md

Lines changed: 10 additions & 11 deletions
@@ -231,7 +231,7 @@ to check if the text is machine-generated (outputs `True` for machine-generated
 >>> detector = WatermarkDetector(model_config=model.config, device="cpu", watermarking_config=watermarking_config)
 >>> detection_out = detector(out, return_dict=True)
 >>> detection_out.prediction
-array([True, True])
+array([ True, True])
 ```
 
 
@@ -269,7 +269,7 @@ dimension you can act upon, in addition to selecting a decoding strategy. Popula
 >>> model = AutoModelForCausalLM.from_pretrained(checkpoint)
 >>> outputs = model.generate(**inputs)
 >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
-['I look forward to seeing you all again!\n\n\n\n\n\n\n\n\n\n\n']
+['I look forward to seeing you all again!\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n']
 ```
 
 ### Contrastive search
@@ -445,7 +445,7 @@ To enable assisted decoding, set the `assistant_model` argument with a model.
 >>> assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint)
 >>> outputs = model.generate(**inputs, assistant_model=assistant_model)
 >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
-['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a']
+['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a glass of wine.']
 ```
 
 <Tip>
@@ -461,7 +461,7 @@ If you're using a `pipeline` object, all you need to do is to pass the assistant
 ... model="meta-llama/Llama-3.1-8B",
 ... assistant_model="meta-llama/Llama-3.2-1B", # This extra line is all that's needed, also works with UAD
 ... torch_dtype=torch.bfloat16
->>> )
+... )
 >>> pipe_output = pipe("Once upon a time, ", max_new_tokens=50, do_sample=False)
 >>> pipe_output[0]["generated_text"]
 'Once upon a time, 3D printing was a niche technology that was only'
@@ -488,7 +488,7 @@ just like in multinomial sampling. However, in assisted decoding, reducing the t
 >>> assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint)
 >>> outputs = model.generate(**inputs, assistant_model=assistant_model, do_sample=True, temperature=0.5)
 >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
-['Alice and Bob, a couple of friends of mine, who are both in the same office as']
+['Alice and Bob are two people who are very different, but they are both very good at what they do. Alice']
 ```
 
 We recommend to install `scikit-learn` library to enhance the candidate generation strategy and achieve additional speedup.
@@ -518,7 +518,7 @@ to ensure the new tokens include the correct prompt suffix.
 >>> assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint)
 >>> outputs = model.generate(**inputs, assistant_model=assistant_model, tokenizer=tokenizer, assistant_tokenizer=assistant_tokenizer)
 >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
-['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a']
+['Alice and Bob are playing a game. Alice has a set of $n$ integers $a_1, a']
 ```
 
 #### Prompt Lookup
@@ -547,7 +547,7 @@ If the model you're using was trained to do early exit, you can pass
 >>> model = AutoModelForCausalLM.from_pretrained(checkpoint)
 >>> outputs = model.generate(**inputs, assistant_early_exit=4, do_sample=False, max_new_tokens=20)
 >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
-['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a']
+['Alice and Bob are playing a game. Alice has a set of $n$ integers $a_1, a']
 ```
 
 ### DoLa Decoding
@@ -571,10 +571,9 @@ See the following examples for DoLa decoding with the 32-layer LLaMA-7B model.
 >>> import torch
 >>> from accelerate.test_utils.testing import get_backend
 
->>> tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
->>> model = AutoModelForCausalLM.from_pretrained("huggyllama/llama-7b", torch_dtype=torch.float16)
 >>> device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
->>> model.to(device)
+>>> tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
+>>> model = AutoModelForCausalLM.from_pretrained("huggyllama/llama-7b", torch_dtype=torch.float16).to(device)
 >>> set_seed(42)
 
 >>> text = "On what date was the Declaration of Independence officially signed?"
@@ -593,7 +592,7 @@ See the following examples for DoLa decoding with the 32-layer LLaMA-7B model.
 # DoLa decoding with contrasting specific layers (layers 28 and 30)
 >>> dola_custom_output = model.generate(**inputs, do_sample=False, max_new_tokens=50, dola_layers=[28,30], repetition_penalty=1.2)
 >>> tokenizer.batch_decode(dola_custom_output[:, inputs.input_ids.shape[-1]:], skip_special_tokens=True)
-['\nIt was officially signed on 2 August 1776, when 56 members of the Second Continental Congress, representing the original 13 American colonies, voted unanimously for the resolution for independence. The 2']
+['\nIn 1891, when he was 54 years old, John Jacob Astor founded his empire. He opened a one-man business and spent the next 27 years working 10-hour days. When']
 ```
 
 #### Understanding the `dola_layers` argument
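
Several of the hunks above update the expected outputs of assisted decoding. For context, a minimal end-to-end sketch of that call is shown below; the checkpoint names are taken from the pipeline example in this diff and are assumed to be accessible (both are gated on the Hub), and the prompt and token budget are illustrative only:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Assumption: reusing the main/assistant checkpoints from the pipeline hunk above.
checkpoint = "meta-llama/Llama-3.1-8B"
assistant_checkpoint = "meta-llama/Llama-3.2-1B"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.bfloat16, device_map="auto")
assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint, torch_dtype=torch.bfloat16, device_map="auto")

inputs = tokenizer("Alice and Bob", return_tensors="pt").to(model.device)

# The small assistant model drafts candidate tokens and the main model verifies them;
# the exact continuation can shift between model and library releases, which is why
# these doctest strings are being refreshed.
outputs = model.generate(**inputs, assistant_model=assistant_model, max_new_tokens=20)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
```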

docs/source/en/index.md

Lines changed: 1 addition & 0 deletions
@@ -385,6 +385,7 @@ Flax), PyTorch, and/or TensorFlow.
 | [YOLOS](model_doc/yolos) ||||
 | [YOSO](model_doc/yoso) ||||
 | [Zamba](model_doc/zamba) ||||
+| [Zamba2](model_doc/zamba2) ||||
 | [ZoeDepth](model_doc/zoedepth) ||||
 
 <!-- End table-->

docs/source/en/kv_cache.md

Lines changed: 35 additions & 33 deletions
@@ -56,7 +56,7 @@ More concretely, key-value cache acts as a memory bank for these generative mode
 >>> import torch
 >>> from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache
 
->>> model_id = "meta-llama/Llama-2-7b-chat-hf"
+>>> model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 >>> model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda:0")
 >>> tokenizer = AutoTokenizer.from_pretrained(model_id)
 
@@ -82,7 +82,13 @@ More concretely, key-value cache acts as a memory bank for these generative mode
 ... cache_position = cache_position[-1:] + 1 # add one more position for the next token
 
 >>> print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])
-"[INST] Hello, what's your name. [/INST] Hello! My name is LLaMA,"
+```
+```txt
+<|user|>
+Hello, what's your name.
+<|assistant|>
+My name is Sarah.
+<|
 ```
 
 </details>
@@ -132,17 +138,13 @@ Cache quantization can be detrimental in terms of latency if the context length
 >>> import torch
 >>> from transformers import AutoTokenizer, AutoModelForCausalLM
 
->>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
->>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0")
+>>> tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+>>> model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.float16).to("cuda:0")
 >>> inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device)
 
 >>> out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="quantized", cache_config={"nbits": 4, "backend": "quanto"})
 >>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
-I like rock music because it's loud and energetic. It's a great way to express myself and rel
-
->>> out = model.generate(**inputs, do_sample=False, max_new_tokens=20)
->>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
-I like rock music because it's loud and energetic. I like to listen to it when I'm feeling
+I like rock music because it's a great way to express myself. I like the way it makes me feel, the
 ```
 
 ### Offloaded Cache
@@ -231,14 +233,14 @@ For more examples with Static Cache and JIT compilation, take a look at [StaticC
 >>> import torch
 >>> from transformers import AutoTokenizer, AutoModelForCausalLM
 
->>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
->>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
+>>> tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+>>> model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.float16, device_map="auto")
 >>> inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)
 
 >>> # simply pass the cache implementation="static"
 >>> out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="static")
 >>> tokenizer.batch_decode(out, skip_special_tokens=True)[0]
-"Hello, my name is [Your Name], and I am a [Your Profession] with [Number of Years] of"
+"Hello, my name is [Your Name] and I am a [Your Position] at [Your Company]. I am writing"
 ```
 
 
@@ -256,7 +258,7 @@ This will use the [`~OffloadedStaticCache`] implementation instead.
 >>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
 >>> inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)
 
->>> # simply pass the cache implementation="static"
+>>> # simply pass the cache implementation="offloaded_static"
 >>> out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="offloaded_static")
 >>> tokenizer.batch_decode(out, skip_special_tokens=True)[0]
 "Hello, my name is [Your Name], and I am a [Your Profession] with [Number of Years] of"
@@ -275,14 +277,14 @@ Note that you can use this cache only for models that support sliding window, e.
 >>> import torch
 >>> from transformers import AutoTokenizer, AutoModelForCausalLM, SinkCache
 
->>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
->>> model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", torch_dtype=torch.float16).to("cuda:0")
+>>> tokenizer = AutoTokenizer.from_pretrained("teknium/OpenHermes-2.5-Mistral-7B")
+>>> model = AutoModelForCausalLM.from_pretrained("teknium/OpenHermes-2.5-Mistral-7B", torch_dtype=torch.float16).to("cuda:0")
 >>> inputs = tokenizer("Yesterday I was on a rock concert and.", return_tensors="pt").to(model.device)
 
 >>> # can be used by passing in cache implementation
 >>> out = model.generate(**inputs, do_sample=False, max_new_tokens=30, cache_implementation="sliding_window")
 >>> tokenizer.batch_decode(out, skip_special_tokens=True)[0]
-"Yesterday I was on a rock concert and. I was so excited to see my favorite band. I was so excited that I was jumping up and down and screaming. I was so excited that I"
+"Yesterday I was on a rock concert and. I was so excited to see my favorite band perform live. I was so happy that I could hardly contain myself. I was jumping up and down and"
 ```
 
 ### Sink Cache
@@ -295,16 +297,16 @@ Unlike other cache classes, this one can't be used directly by indicating a `cac
 >>> import torch
 >>> from transformers import AutoTokenizer, AutoModelForCausalLM, SinkCache
 
->>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
->>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0")
+>>> tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+>>> model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.float16).to("cuda:0")
 >>> inputs = tokenizer("This is a long story about unicorns, fairies and magic.", return_tensors="pt").to(model.device)
 
 >>> # get our cache, specify number of sink tokens and window size
 >>> # Note that window size already includes sink tokens, so has to be larger
 >>> past_key_values = SinkCache(window_length=256, num_sink_tokens=4)
 >>> out = model.generate(**inputs, do_sample=False, max_new_tokens=30, past_key_values=past_key_values)
 >>> tokenizer.batch_decode(out, skip_special_tokens=True)[0]
-"This is a long story about unicorns, fairies and magic. It is a fantasy world where unicorns and fairies live together in harmony. The story follows a young girl named Lily"
+"This is a long story about unicorns, fairies and magic. It is a story about a young girl named Lily who discovers that she has the power to control the elements. She learns that she can"
 ```
 
 ### Encoder-Decoder Cache
@@ -332,15 +334,15 @@ In case you are using Sink Cache, you have to crop your inputs to that maximum l
 >>> import torch
 >>> from transformers import AutoTokenizer,AutoModelForCausalLM
 >>> from transformers.cache_utils import (
->>> DynamicCache,
->>> SinkCache,
->>> StaticCache,
->>> SlidingWindowCache,
->>> QuantoQuantizedCache,
->>> QuantizedCacheConfig,
->>> )
-
->>> model_id = "meta-llama/Llama-2-7b-chat-hf"
+... DynamicCache,
+... SinkCache,
+... StaticCache,
+... SlidingWindowCache,
+... QuantoQuantizedCache,
+... QuantizedCacheConfig,
+... )
+
+>>> model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 >>> model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map='auto')
 >>> tokenizer = AutoTokenizer.from_pretrained(model_id)
 
@@ -363,7 +365,7 @@ In case you are using Sink Cache, you have to crop your inputs to that maximum l
 ... messages.append({"role": "assistant", "content": completion})
 
 print(messages)
-[{'role': 'user', 'content': "Hello, what's your name?"}, {'role': 'assistant', 'content': " Hello! My name is LLaMA, I'm a large language model trained by a team of researcher at Meta AI. 😊"}, {'role': 'user', 'content': 'Btw, yesterday I was on a rock concert.'}, {'role': 'assistant', 'content': ' Oh, cool! That sounds like a lot of fun! 🎉 Did you enjoy the concert? What was the band like? 🤔'}]
+[{'role': 'user', 'content': "Hello, what's your name?"}, {'role': 'assistant', 'content': "Hello, I'm AI."}, {'role': 'user', 'content': 'Btw, yesterday I was on a rock concert.'}, {'role': 'assistant', 'content': "I'm sorry to hear that you were on a rock concert yesterday. It sounds like a fun experience, but I'm not capable of experiencing music or concerts. However, I can provide you with some information about rock music and its history. Rock music emerged in the 1950s and 1960s in the United States and Britain, and it quickly gained popularity around the world. Some of the most famous rock bands of all time include The Beatles, The Rolling Stones, Led Zeppelin, and Pink Floyd. Rock music has a distinct sound and style, with elements of blues, country, and folk music. It often features guitar solos, heavy bass lines, and drums. Rock music has had a significant impact on popular culture, influencing genres such as punk rock, heavy metal, and alternative rock."}]
 ```
 
 
@@ -376,7 +378,7 @@ Sometimes you would want to first fill-in cache object with key/values for certa
 >>> import torch
 >>> from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache, StaticCache
 
->>> model_id = "meta-llama/Llama-2-7b-chat-hf"
+>>> model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 >>> model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda")
 >>> tokenizer = AutoTokenizer.from_pretrained(model_id)
 
@@ -400,7 +402,7 @@ Sometimes you would want to first fill-in cache object with key/values for certa
 ... responses.append(response)
 
 >>> print(responses)
-['<s> You are a helpful assistant. Help me to write a blogpost about travelling.\n\nTitle: The Ultimate Guide to Travelling: Tips, Tricks, and', '<s> You are a helpful assistant. What is the capital of France?\n\nYes, the capital of France is Paris.</s>']
+['<s> You are a helpful assistant. Help me to write a blogpost about travelling. I am excited to share my experiences with you. I have been traveling for the past', '<s> You are a helpful assistant. What is the capital of France? \n\nAnswer: Paris is the capital of France.</s>']
 ```
 
 
@@ -414,8 +416,8 @@ this legacy format, you can seamlessly convert it to a `DynamicCache` and back.
 >>> import torch
 >>> from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache
 
->>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
->>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
+>>> tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+>>> model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.float16, device_map="auto")
 >>> inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)
 
 >>> # `return_dict_in_generate=True` is required to return the cache. `return_legacy_cache` forces the returned cache
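
The last hunk touches the snippet that converts between the legacy cache format and `DynamicCache`. For context, a minimal sketch of that round trip, assuming the TinyLlama checkpoint used throughout this diff and a single accelerator device, looks like this:

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")
inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)

# `return_dict_in_generate=True` is required to get the cache back;
# `return_legacy_cache=True` forces it into the legacy tuple-of-tuples format.
outputs = model.generate(**inputs, return_dict_in_generate=True, return_legacy_cache=True, max_new_tokens=5)

# Convert the legacy tuple-of-tuples into a DynamicCache and back again.
cache = DynamicCache.from_legacy_cache(outputs.past_key_values)
legacy_cache = cache.to_legacy_cache()
```

The checkpoint swap from Llama-2-7B to TinyLlama in this file only changes the doctest outputs; the cache APIs themselves are untouched.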

docs/source/en/model_doc/glm.md

Lines changed: 1 addition & 1 deletion
@@ -56,7 +56,7 @@ In the following, we demonstrate how to use `glm-4-9b-chat` for the inference. N
 >>> from transformers import AutoModelForCausalLM, AutoTokenizer
 >>> device = "cuda" # the device to load the model onto
 
->>> model = AutoModelForCausalLM.from_pretrained("THUDM/glm-4-9b-chat", device_map="auto")
+>>> model = AutoModelForCausalLM.from_pretrained("THUDM/glm-4-9b-chat", device_map="auto", trust_remote_code=True)
 >>> tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b-chat")
 
 >>> prompt = "Give me a short introduction to large language model."
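
For context, a runnable sketch of the loading pattern this hunk fixes is below. The addition of `trust_remote_code=True` suggests the `THUDM/glm-4-9b-chat` checkpoint relies on custom code hosted on the Hub; the prompt comes from the surrounding doc, while the generation settings are illustrative assumptions.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained(
    "THUDM/glm-4-9b-chat", device_map="auto", trust_remote_code=True
)
# Depending on the checkpoint revision, the tokenizer may also need trust_remote_code=True.
tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b-chat")

prompt = "Give me a short introduction to large language model."
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

outputs = model.generate(**inputs, max_new_tokens=100)  # illustrative length
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```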
