From 415f0d2c7353b44833ecd65c49b9aa9ef54947ff Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Tue, 6 Jun 2023 21:16:21 +0100
Subject: [PATCH 01/87] Copies `modeling_flax_gpt_neo.py` to start

---
 .../models/llama/modeling_flax_llama.py       | 684 ++++++++++++++++++
 1 file changed, 684 insertions(+)
 create mode 100644 src/transformers/models/llama/modeling_flax_llama.py

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
new file mode 100644
index 00000000000000..0749911f7a15fa
--- /dev/null
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -0,0 +1,684 @@
+# coding=utf-8
+# Copyright 2021 The Eleuther AI and The Google Flax Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import partial
+from typing import Optional, Tuple
+
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
+from flax.linen import combine_masks, make_causal_mask
+from flax.linen.attention import dot_product_attention_weights
+from flax.traverse_util import flatten_dict, unflatten_dict
+from jax import lax
+
+from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxCausalLMOutput
+from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
+from .configuration_gpt_neo import GPTNeoConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "GPTNeoConfig"
+_CHECKPOINT_FOR_DOC = "EleutherAI/gpt-neo-1.3B"
+
+
+GPT_NEO_START_DOCSTRING = r"""
+
+    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a Flax Linen
+    [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
+    regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior.
+
+    Finally, this model supports inherent JAX features such as:
+
+    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
+
+    Parameters:
+        config ([`GPTNeoConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
+        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
+            `jax.numpy.bfloat16` (on TPUs).
+
+            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
+            specified all the computation will be performed with the given `dtype`.
+
+            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
+            parameters.**
+
+            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
+            [`~FlaxPreTrainedModel.to_bf16`].
+"""
+
+GPT_NEO_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length`. Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+        past_key_values (`Dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
+            Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
+            auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+class FlaxGPTNeoSelfAttention(nn.Module):
+    config: GPTNeoConfig
+    attention_type: str
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        config = self.config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and "
+                f"`num_heads`: {self.num_heads})."
+            )
+
+        self.attn_dropout = nn.Dropout(config.attention_dropout)
+        self.resid_dropout = nn.Dropout(config.resid_dropout)
+
+        dense = partial(
+            nn.Dense,
+            self.embed_dim,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+        )
+
+        self.q_proj, self.k_proj, self.v_proj = dense(use_bias=False), dense(use_bias=False), dense(use_bias=False)
+        self.out_proj = dense()
+
+        self.causal_mask = make_causal_mask(jnp.ones((1, config.max_position_embeddings), dtype="bool"), dtype="bool")
+        if self.attention_type == "local":
+            self.causal_mask = self.causal_mask ^ jnp.tril(self.causal_mask, -config.window_size)
+
+    def _split_heads(self, hidden_states):
+        return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim))
+
+    def _merge_heads(self, hidden_states):
+        return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))
+
+    @nn.compact
+    def _concatenate_to_cache(self, key, value, query, attention_mask):
+        """
+        This function takes projected key, value states from a single input token and concatenates the states to cached
+        states from previous steps. This function is slighly adapted from the official Flax repository:
+        https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
+        """
+        # detect if we're initializing by absence of existing cache data.
+        is_initialized = self.has_variable("cache", "cached_key")
+        cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
+        cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
+        cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
+
+        if is_initialized:
+            *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
+            # update key, value caches with our new 1d spatial slices
+            cur_index = cache_index.value
+            indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
+            key = lax.dynamic_update_slice(cached_key.value, key, indices)
+            value = lax.dynamic_update_slice(cached_value.value, value, indices)
+            cached_key.value = key
+            cached_value.value = value
+            num_updated_cache_vectors = query.shape[1]
+            cache_index.value = cache_index.value + num_updated_cache_vectors
+            # causal mask for cached decoder self-attention: our single query position should only attend to those key positions that have already been generated and cached, not the remaining zero elements.
+            pad_mask = jnp.broadcast_to(
+                jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
+                tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
+            )
+            attention_mask = combine_masks(pad_mask, attention_mask)
+        return key, value, attention_mask
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask=None,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+    ):
+        query = self.q_proj(hidden_states) * jnp.sqrt(self.head_dim).astype(self.dtype)
+        key = self.k_proj(hidden_states)
+        value = self.v_proj(hidden_states)
+
+        query = self._split_heads(query)
+        key = self._split_heads(key)
+        value = self._split_heads(value)
+
+        query_length, key_length = query.shape[1], key.shape[1]
+
+        if self.has_variable("cache", "cached_key"):
+            mask_shift = self.variables["cache"]["cache_index"]
+            max_decoder_length = self.variables["cache"]["cached_key"].shape[1]
+            causal_mask = lax.dynamic_slice(
+                self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length)
+            )
+        else:
+            causal_mask = self.causal_mask[:, :, :query_length, :key_length]
+
+        batch_size = hidden_states.shape[0]
+        causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:])
+
+        attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape)
+        attention_mask = combine_masks(attention_mask, causal_mask)
+
+        dropout_rng = None
+        if not deterministic and self.config.attention_dropout > 0.0:
+            dropout_rng = self.make_rng("dropout")
+
+        # During fast autoregressive decoding, we feed one position at a time,
+        # and cache the keys and values step by step.
+        if self.has_variable("cache", "cached_key") or init_cache:
+            key, value, attention_mask = self._concatenate_to_cache(key, value, query, attention_mask)
+
+        # transform boolean mask into float mask
+        attention_bias = lax.select(
+            attention_mask > 0,
+            jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
+            jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype),
+        )
+
+        # usual dot product attention
+        attn_weights = dot_product_attention_weights(
+            query,
+            key,
+            bias=attention_bias,
+            dropout_rng=dropout_rng,
+            dropout_rate=self.config.attention_dropout,
+            deterministic=deterministic,
+            dtype=self.dtype,
+            precision=None,
+        )
+
+        attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value)
+        attn_output = self._merge_heads(attn_output)
+        attn_output = self.out_proj(attn_output)
+        attn_output = self.resid_dropout(attn_output, deterministic=deterministic)
+
+        outputs = (attn_output, attn_weights) if output_attentions else (attn_output,)
+        return outputs
+
+
+class FlaxGPTNeoAttention(nn.Module):
+    config: GPTNeoConfig
+    layer_id: int = 0
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        attention_type = self.config.attention_layers[self.layer_id]
+        self.attention = FlaxGPTNeoSelfAttention(self.config, attention_type, dtype=self.dtype)
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask=None,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+    ):
+        return self.attention(
+            hidden_states,
+            attention_mask=attention_mask,
+            deterministic=deterministic,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+        )
+
+
+class FlaxGPTNeoMLP(nn.Module):
+    config: GPTNeoConfig
+    intermediate_size: int
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        embed_dim = self.config.hidden_size
+        kernel_init = jax.nn.initializers.normal(self.config.initializer_range)
+        self.c_fc = nn.Dense(self.intermediate_size, dtype=self.dtype, kernel_init=kernel_init)
+        self.c_proj = nn.Dense(embed_dim, dtype=self.dtype, kernel_init=kernel_init)
+        self.act = ACT2FN[self.config.activation_function]
+        self.dropout = nn.Dropout(rate=self.config.resid_dropout)
+
+    def __call__(self, hidden_states, deterministic: bool = True):
+        hidden_states = self.c_fc(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.c_proj(hidden_states)
+        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
+        return hidden_states
+
+
+class FlaxGPTNeoBlock(nn.Module):
+    config: GPTNeoConfig
+    layer_id: int = 0
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        hidden_size = self.config.hidden_size
+        inner_dim = self.config.intermediate_size if self.config.intermediate_size is not None else 4 * hidden_size
+
+        self.ln_1 = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
+        self.attn = FlaxGPTNeoAttention(self.config, layer_id=self.layer_id, dtype=self.dtype)
+        self.ln_2 = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
+        self.mlp = FlaxGPTNeoMLP(self.config, inner_dim, dtype=self.dtype)
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask=None,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+    ):
+        residual = hidden_states
+        hidden_states = self.ln_1(hidden_states)
+        outputs = self.attn(
+            hidden_states,
+            attention_mask=attention_mask,
+            deterministic=deterministic,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+        )
+        # residual connection
+        attn_output = outputs[0]
+        hidden_states = attn_output + residual
+
+        residual = hidden_states
+        hidden_states = self.ln_2(hidden_states)
+        feed_forward_hidden_states = self.mlp(hidden_states, deterministic=deterministic)
+        # residual connection
+        hidden_states = residual + feed_forward_hidden_states
+
+        return (hidden_states,) + outputs[1:]
+
+
+class FlaxGPTNeoPreTrainedModel(FlaxPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = GPTNeoConfig
+    base_model_prefix = "transformer"
+    module_class: nn.Module = None
+
+    def __init__(
+        self,
+        config: GPTNeoConfig,
+        input_shape: Tuple = (1, 1),
+        seed: int = 0,
+        dtype: jnp.dtype = jnp.float32,
+        _do_init: bool = True,
+        **kwargs,
+    ):
+        module = self.module_class(config=config, dtype=dtype, **kwargs)
+        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
+
+    def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
+        # init input tensors
+        input_ids = jnp.zeros(input_shape, dtype="i4")
+        attention_mask = jnp.ones_like(input_ids)
+        position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape)
+        params_rng, dropout_rng = jax.random.split(rng)
+        rngs = {"params": params_rng, "dropout": dropout_rng}
+
+        random_params = self.module.init(rngs, input_ids, attention_mask, position_ids, return_dict=False)["params"]
+
+        if params is not None:
+            random_params = flatten_dict(unfreeze(random_params))
+            params = flatten_dict(unfreeze(params))
+            for missing_key in self._missing_keys:
+                params[missing_key] = random_params[missing_key]
+            self._missing_keys = set()
+            return freeze(unflatten_dict(params))
+        else:
+            return random_params
+
+    def init_cache(self, batch_size, max_length):
+        r"""
+        Args:
+            batch_size (`int`):
+                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
+            max_length (`int`):
+                maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
+                cache.
+        """
+        # init input variables to retrieve cache
+        input_ids = jnp.ones((batch_size, max_length))
+        attention_mask = jnp.ones_like(input_ids)
+        position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
+
+        init_variables = self.module.init(
+            jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True
+        )
+        return unfreeze(init_variables["cache"])
+
+    @add_start_docstrings_to_model_forward(GPT_NEO_INPUTS_DOCSTRING)
+    def __call__(
+        self,
+        input_ids,
+        attention_mask=None,
+        position_ids=None,
+        params: dict = None,
+        past_key_values: dict = None,
+        dropout_rng: jax.random.PRNGKey = None,
+        train: bool = False,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        batch_size, sequence_length = input_ids.shape
+
+        if position_ids is None:
+            if past_key_values is not None:
+                raise ValueError("Make sure to provide `position_ids` when passing `past_key_values`.")
+
+            position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
+
+        if attention_mask is None:
+            attention_mask = jnp.ones((batch_size, sequence_length))
+
+        # Handle any PRNG if needed
+        rngs = {}
+        if dropout_rng is not None:
+            rngs["dropout"] = dropout_rng
+
+        inputs = {"params": params or self.params}
+
+        # if past_key_values are passed then cache is already initialized a private flag init_cache has to be passed down to ensure cache is used. It has to be made sure that cache is marked as mutable so that it can be changed by FlaxGPTNeoAttention module
+        if past_key_values:
+            inputs["cache"] = past_key_values
+            mutable = ["cache"]
+        else:
+            mutable = False
+
+        outputs = self.module.apply(
+            inputs,
+            jnp.array(input_ids, dtype="i4"),
+            jnp.array(attention_mask, dtype="i4"),
+            jnp.array(position_ids, dtype="i4"),
+            not train,
+            False,
+            output_attentions,
+            output_hidden_states,
+            return_dict,
+            rngs=rngs,
+            mutable=mutable,
+        )
+
+        # add updated cache to model output
+        if past_key_values is not None and return_dict:
+            outputs, past_key_values = outputs
+            outputs["past_key_values"] = unfreeze(past_key_values["cache"])
+            return outputs
+        elif past_key_values is not None and not return_dict:
+            outputs, past_key_values = outputs
+            outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:]
+
+        return outputs
+
+
+class FlaxGPTNeoBlockCollection(nn.Module):
+    config: GPTNeoConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.blocks = [
+            FlaxGPTNeoBlock(self.config, layer_id=i, name=str(i), dtype=self.dtype)
+            for i in range(self.config.num_hidden_layers)
+        ]
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask=None,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        all_attentions = () if output_attentions else None
+        all_hidden_states = () if output_hidden_states else None
+
+        for block in self.blocks:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            layer_outputs = block(
+                hidden_states,
+                attention_mask,
+                deterministic=deterministic,
+                init_cache=init_cache,
+                output_attentions=output_attentions,
+            )
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions += (layer_outputs[1],)
+
+        # this contains possible `None` values - `FlaxGPTNeoModule` will filter them out
+        outputs = (hidden_states, all_hidden_states, all_attentions)
+
+        return outputs
+
+
+class FlaxGPTNeoModule(nn.Module):
+    config: GPTNeoConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.embed_dim = self.config.hidden_size
+        embedding_init = jax.nn.initializers.normal(stddev=self.config.initializer_range)
+        self.wte = nn.Embed(
+            self.config.vocab_size,
+            self.embed_dim,
+            embedding_init=embedding_init,
+        )
+        self.wpe = nn.Embed(
+            self.config.max_position_embeddings,
+            self.embed_dim,
+            embedding_init=embedding_init,
+        )
+        self.dropout = nn.Dropout(rate=self.config.embed_dropout)
+        self.h = FlaxGPTNeoBlockCollection(self.config, dtype=self.dtype)
+        self.ln_f = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        position_ids,
+        deterministic=True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        input_embeds = self.wte(input_ids.astype("i4"))
+        position_embeds = self.wpe(position_ids.astype("i4"))
+
+        hidden_states = input_embeds + position_embeds
+        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
+
+        outputs = self.h(
+            hidden_states,
+            attention_mask,
+            deterministic=deterministic,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        hidden_states = self.ln_f(hidden_states)
+
+        hidden_states = outputs[0]
+        hidden_states = self.ln_f(hidden_states)
+
+        if output_hidden_states:
+            all_hidden_states = outputs[1] + (hidden_states,)
+            outputs = (hidden_states, all_hidden_states) + outputs[2:]
+        else:
+            outputs = (hidden_states,) + outputs[1:]
+
+        if not return_dict:
+            return tuple(v for v in outputs if v is not None)
+
+        return FlaxBaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=outputs[1],
+            attentions=outputs[-1],
+        )
+
+
+@add_start_docstrings(
+    "The bare GPTNeo Model transformer outputting raw hidden-states without any specific head on top.",
+    GPT_NEO_START_DOCSTRING,
+)
+class FlaxGPTNeoModel(FlaxGPTNeoPreTrainedModel):
+    module_class = FlaxGPTNeoModule
+
+
+append_call_sample_docstring(FlaxGPTNeoModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutput, _CONFIG_FOR_DOC)
+
+
+class FlaxGPTNeoForCausalLMModule(nn.Module):
+    config: GPTNeoConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.transformer = FlaxGPTNeoModule(self.config, dtype=self.dtype)
+        self.lm_head = nn.Dense(
+            self.config.vocab_size,
+            use_bias=False,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+        )
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        position_ids,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        outputs = self.transformer(
+            input_ids,
+            attention_mask,
+            position_ids,
+            deterministic=deterministic,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+
+        if self.config.tie_word_embeddings:
+            shared_kernel = self.transformer.variables["params"]["wte"]["embedding"].T
+            lm_logits = self.lm_head.apply({"params": {"kernel": shared_kernel}}, hidden_states)
+        else:
+            lm_logits = self.lm_head(hidden_states)
+
+        if not return_dict:
+            return (lm_logits,) + outputs[1:]
+
+        return FlaxCausalLMOutput(logits=lm_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)
+
+
+@add_start_docstrings(
+    """
+    The GPTNeo Model transformer with a language modeling head on top (linear layer with weights tied to the input
+    embeddings).
+    """,
+    GPT_NEO_START_DOCSTRING,
+)
+class FlaxGPTNeoForCausalLM(FlaxGPTNeoPreTrainedModel):
+    module_class = FlaxGPTNeoForCausalLMModule
+
+    def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jnp.DeviceArray] = None):
+        # initializing the cache
+        batch_size, seq_length = input_ids.shape
+
+        past_key_values = self.init_cache(batch_size, max_length)
+        # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length.
+        # But since GPTNeo uses a causal mask, those positions are masked anyways.
+        # Thus we can create a single static attention_mask here, which is more efficient for compilation
+        extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
+        if attention_mask is not None:
+            position_ids = attention_mask.cumsum(axis=-1) - 1
+            extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0))
+        else:
+            position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
+
+        return {
+            "past_key_values": past_key_values,
+            "attention_mask": extended_attention_mask,
+            "position_ids": position_ids,
+        }
+
+    def update_inputs_for_generation(self, model_outputs, model_kwargs):
+        model_kwargs["past_key_values"] = model_outputs.past_key_values
+        model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1
+        return model_kwargs
+
+
+append_call_sample_docstring(FlaxGPTNeoForCausalLM, _CHECKPOINT_FOR_DOC, FlaxCausalLMOutput, _CONFIG_FOR_DOC)

From 76a599c82c104c470ff2ff452fddee81a924492b Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Tue, 6 Jun 2023 22:48:31 +0100
Subject: [PATCH 02/87] MLP Block. WIP Attention and Block

---
 .../models/llama/modeling_flax_llama.py       | 231 ++++++++++--------
 1 file changed, 128 insertions(+), 103 deletions(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index 0749911f7a15fa..e25822db7c8c5f 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -19,6 +19,7 @@
 import flax.linen as nn
 import jax
 import jax.numpy as jnp
+import numpy as np
 from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
 from flax.linen import combine_masks, make_causal_mask
 from flax.linen.attention import dot_product_attention_weights
@@ -26,18 +27,17 @@
 from jax import lax
 
 from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxCausalLMOutput
-from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring
+from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel
 from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
-from .configuration_gpt_neo import GPTNeoConfig
+from .configuration_llama import LlamaConfig
 
 
 logger = logging.get_logger(__name__)
 
-_CONFIG_FOR_DOC = "GPTNeoConfig"
-_CHECKPOINT_FOR_DOC = "EleutherAI/gpt-neo-1.3B"
+_CONFIG_FOR_DOC = "LlamaConfig"
 
 
-GPT_NEO_START_DOCSTRING = r"""
+LLAMA_START_DOCSTRING = r"""
 
     This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
@@ -55,27 +55,16 @@
     - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
 
     Parameters:
-        config ([`GPTNeoConfig`]): Model configuration class with all the parameters of the model.
+        config ([`LlamaConfig`]): Model configuration class with all the parameters of the model.
             Initializing with a config file does not load the weights associated with the model, only the
             configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
-        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
-            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
-            `jax.numpy.bfloat16` (on TPUs).
-
-            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
-            specified all the computation will be performed with the given `dtype`.
-
-            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
-            parameters.**
-
-            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
-            [`~FlaxPreTrainedModel.to_bf16`].
 """
 
-GPT_NEO_INPUTS_DOCSTRING = r"""
+LLAMA_INPUTS_DOCSTRING = r"""
     Args:
         input_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`):
-            `input_ids_length` = `sequence_length`. Indices of input sequence tokens in the vocabulary.
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
 
             Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
             [`PreTrainedTokenizer.__call__`] for details.
@@ -88,12 +77,34 @@
             - 0 for tokens that are **masked**.
 
             [What are attention masks?](../glossary#attention-mask)
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+
+            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+            information on the default strategy.
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
         position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
             Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
-            config.max_position_embeddings - 1]`.
+            config.n_positions - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
         past_key_values (`Dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
             Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
             auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*.
+        inputs_embeds (`np.array` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
         output_attentions (`bool`, *optional*):
             Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
             tensors for more detail.
@@ -105,9 +116,34 @@
 """
 
 
-class FlaxGPTNeoSelfAttention(nn.Module):
-    config: GPTNeoConfig
-    attention_type: str
+def create_sinusoidal_positions(num_pos, dim):
+    inv_freq = 1.0 / (10000 ** (np.arange(0, dim, 2) / dim))
+    sinusoid_inp = np.einsum("i , j -> i j", np.arange(num_pos), inv_freq).astype("float32")
+    sin, cos = np.sin(sinusoid_inp), np.cos(sinusoid_inp)
+
+    sentinel = dim // 2 + dim % 2
+    out = np.zeros((num_pos, dim))
+    out[:, 0:sentinel] = sin
+    out[:, sentinel:] = cos
+
+    return jnp.array(out)
+
+
+def rotate_every_two(tensor):
+    rotate_half_tensor = jnp.stack((-tensor[:, :, :, 1::2], tensor[:, :, :, ::2]), axis=-1)
+    rotate_half_tensor = rotate_half_tensor.reshape(rotate_half_tensor.shape[:-2] + (-1,))
+    return rotate_half_tensor
+
+
+def apply_rotary_pos_emb(tensor, sincos):
+    sin_pos, cos_pos = sincos
+    sin_pos = sin_pos[:, :, None, :].repeat(2, 3)
+    cos_pos = cos_pos[:, :, None, :].repeat(2, 3)
+    return (tensor * cos_pos) + (rotate_every_two(tensor) * sin_pos)
+
+
+class FlaxLlamaAttention(nn.Module):
+    config: LlamaConfig
     dtype: jnp.dtype = jnp.float32
 
     def setup(self):
@@ -115,28 +151,27 @@ def setup(self):
         self.embed_dim = config.hidden_size
         self.num_heads = config.num_attention_heads
         self.head_dim = self.embed_dim // self.num_heads
+        self.rotary_dim = config.rotary_dim
+
         if self.head_dim * self.num_heads != self.embed_dim:
             raise ValueError(
-                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and "
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size} and "
                 f"`num_heads`: {self.num_heads})."
             )
 
-        self.attn_dropout = nn.Dropout(config.attention_dropout)
-        self.resid_dropout = nn.Dropout(config.resid_dropout)
-
         dense = partial(
             nn.Dense,
             self.embed_dim,
+            use_bias=False,
             dtype=self.dtype,
             kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
         )
-
-        self.q_proj, self.k_proj, self.v_proj = dense(use_bias=False), dense(use_bias=False), dense(use_bias=False)
-        self.out_proj = dense()
+        self.q_proj, self.k_proj, self.v_proj, self.o_proj = [dense() for _ in range(4)]
 
         self.causal_mask = make_causal_mask(jnp.ones((1, config.max_position_embeddings), dtype="bool"), dtype="bool")
-        if self.attention_type == "local":
-            self.causal_mask = self.causal_mask ^ jnp.tril(self.causal_mask, -config.window_size)
+
+        pos_embd_dim = self.rotary_dim or self.embed_dim
+        self.embed_positions = create_sinusoidal_positions(config.max_position_embeddings, pos_embd_dim)
 
     def _split_heads(self, hidden_states):
         return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim))
@@ -176,6 +211,7 @@ def _concatenate_to_cache(self, key, value, query, attention_mask):
             attention_mask = combine_masks(pad_mask, attention_mask)
         return key, value, attention_mask
 
+    # TODO: update this call to add rotary and any other changes
     def __call__(
         self,
         hidden_states,
@@ -246,55 +282,29 @@ def __call__(
         return outputs
 
 
-class FlaxGPTNeoAttention(nn.Module):
-    config: GPTNeoConfig
-    layer_id: int = 0
-    dtype: jnp.dtype = jnp.float32
-
-    def setup(self):
-        attention_type = self.config.attention_layers[self.layer_id]
-        self.attention = FlaxGPTNeoSelfAttention(self.config, attention_type, dtype=self.dtype)
-
-    def __call__(
-        self,
-        hidden_states,
-        attention_mask=None,
-        deterministic: bool = True,
-        init_cache: bool = False,
-        output_attentions: bool = False,
-    ):
-        return self.attention(
-            hidden_states,
-            attention_mask=attention_mask,
-            deterministic=deterministic,
-            init_cache=init_cache,
-            output_attentions=output_attentions,
-        )
-
-
-class FlaxGPTNeoMLP(nn.Module):
-    config: GPTNeoConfig
+class FlaxLlamaMLP(nn.Module):
+    config: LlamaConfig
     intermediate_size: int
     dtype: jnp.dtype = jnp.float32
 
     def setup(self):
         embed_dim = self.config.hidden_size
-        kernel_init = jax.nn.initializers.normal(self.config.initializer_range)
-        self.c_fc = nn.Dense(self.intermediate_size, dtype=self.dtype, kernel_init=kernel_init)
-        self.c_proj = nn.Dense(embed_dim, dtype=self.dtype, kernel_init=kernel_init)
-        self.act = ACT2FN[self.config.activation_function]
-        self.dropout = nn.Dropout(rate=self.config.resid_dropout)
-
-    def __call__(self, hidden_states, deterministic: bool = True):
-        hidden_states = self.c_fc(hidden_states)
+        jax.nn.initializers.normal(self.config.initializer_range)
+        self.act = ACT2FN[self.config.hidden_act]
+
+        self.gate_proj = nn.Dense(self.intermediate_size, use_bias=False)
+        self.down_proj = nn.Dense(embed_dim, use_bias=False)
+        self.up_proj = nn.Dense(self.intermediate_size, use_bias=False)
+
+    def __call__(self, hidden_states):
+        hidden_states = self.up_proj(hidden_states) * self.gate_proj(hidden_states)
         hidden_states = self.act(hidden_states)
-        hidden_states = self.c_proj(hidden_states)
-        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
+        hidden_states = self.down_proj(hidden_states)
         return hidden_states
 
 
-class FlaxGPTNeoBlock(nn.Module):
-    config: GPTNeoConfig
+class FlaxLlamaBlock(nn.Module):
+    config: LlamaConfig
     layer_id: int = 0
     dtype: jnp.dtype = jnp.float32
 
@@ -302,10 +312,10 @@ def setup(self):
         hidden_size = self.config.hidden_size
         inner_dim = self.config.intermediate_size if self.config.intermediate_size is not None else 4 * hidden_size
 
-        self.ln_1 = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
-        self.attn = FlaxGPTNeoAttention(self.config, layer_id=self.layer_id, dtype=self.dtype)
-        self.ln_2 = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
-        self.mlp = FlaxGPTNeoMLP(self.config, inner_dim, dtype=self.dtype)
+        self.ln_1 = nn.RMSNorm(epsilon=self.config.rms_norm_eps, dtype=self.dtype)
+        self.attn = FlaxLlamaAttention(self.config, dtype=self.dtype)
+        self.ln_2 = nn.RMSNorm(epsilon=self.config.rms_norm_eps, dtype=self.dtype)
+        self.mlp = FlaxLlamaMLP(self.config, inner_dim, dtype=self.dtype)
 
     def __call__(
         self,
@@ -337,19 +347,19 @@ def __call__(
         return (hidden_states,) + outputs[1:]
 
 
-class FlaxGPTNeoPreTrainedModel(FlaxPreTrainedModel):
+class FlaxLlamaPreTrainedModel(FlaxPreTrainedModel):
     """
     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
     models.
     """
 
-    config_class = GPTNeoConfig
+    config_class = LlamaConfig
     base_model_prefix = "transformer"
     module_class: nn.Module = None
 
     def __init__(
         self,
-        config: GPTNeoConfig,
+        config: LlamaConfig,
         input_shape: Tuple = (1, 1),
         seed: int = 0,
         dtype: jnp.dtype = jnp.float32,
@@ -398,7 +408,7 @@ def init_cache(self, batch_size, max_length):
         )
         return unfreeze(init_variables["cache"])
 
-    @add_start_docstrings_to_model_forward(GPT_NEO_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
     def __call__(
         self,
         input_ids,
@@ -436,7 +446,7 @@ def __call__(
 
         inputs = {"params": params or self.params}
 
-        # if past_key_values are passed then cache is already initialized a private flag init_cache has to be passed down to ensure cache is used. It has to be made sure that cache is marked as mutable so that it can be changed by FlaxGPTNeoAttention module
+        # if past_key_values are passed then cache is already initialized a private flag init_cache has to be passed down to ensure cache is used. It has to be made sure that cache is marked as mutable so that it can be changed by FlaxLlamaNeoAttention module
         if past_key_values:
             inputs["cache"] = past_key_values
             mutable = ["cache"]
@@ -469,13 +479,13 @@ def __call__(
         return outputs
 
 
-class FlaxGPTNeoBlockCollection(nn.Module):
-    config: GPTNeoConfig
+class FlaxLlamaBlockCollection(nn.Module):
+    config: LlamaConfig
     dtype: jnp.dtype = jnp.float32
 
     def setup(self):
         self.blocks = [
-            FlaxGPTNeoBlock(self.config, layer_id=i, name=str(i), dtype=self.dtype)
+            FlaxLlamaBlock(self.config, layer_id=i, name=str(i), dtype=self.dtype)
             for i in range(self.config.num_hidden_layers)
         ]
 
@@ -508,14 +518,14 @@ def __call__(
             if output_attentions:
                 all_attentions += (layer_outputs[1],)
 
-        # this contains possible `None` values - `FlaxGPTNeoModule` will filter them out
+        # this contains possible `None` values - `FlaxLlamaModule` will filter them out
         outputs = (hidden_states, all_hidden_states, all_attentions)
 
         return outputs
 
 
-class FlaxGPTNeoModule(nn.Module):
-    config: GPTNeoConfig
+class FlaxLlamaModule(nn.Module):
+    config: LlamaConfig
     dtype: jnp.dtype = jnp.float32
 
     def setup(self):
@@ -532,7 +542,7 @@ def setup(self):
             embedding_init=embedding_init,
         )
         self.dropout = nn.Dropout(rate=self.config.embed_dropout)
-        self.h = FlaxGPTNeoBlockCollection(self.config, dtype=self.dtype)
+        self.h = FlaxLlamaBlockCollection(self.config, dtype=self.dtype)
         self.ln_f = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
 
     def __call__(
@@ -585,22 +595,22 @@ def __call__(
 
 
 @add_start_docstrings(
-    "The bare GPTNeo Model transformer outputting raw hidden-states without any specific head on top.",
-    GPT_NEO_START_DOCSTRING,
+    "The bare Llama Model transformer outputting raw hidden-states without any specific head on top.",
+    LLAMA_START_DOCSTRING,
 )
-class FlaxGPTNeoModel(FlaxGPTNeoPreTrainedModel):
-    module_class = FlaxGPTNeoModule
+class FlaxLlamaModel(FlaxLlamaPreTrainedModel):
+    module_class = FlaxLlamaModule
 
 
-append_call_sample_docstring(FlaxGPTNeoModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutput, _CONFIG_FOR_DOC)
+# append_call_sample_docstring(FlaxLlamaModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutput, _CONFIG_FOR_DOC)
 
 
-class FlaxGPTNeoForCausalLMModule(nn.Module):
-    config: GPTNeoConfig
+class FlaxLlamaForCausalLMModule(nn.Module):
+    config: LlamaConfig
     dtype: jnp.dtype = jnp.float32
 
     def setup(self):
-        self.transformer = FlaxGPTNeoModule(self.config, dtype=self.dtype)
+        self.transformer = FlaxLlamaModule(self.config, dtype=self.dtype)
         self.lm_head = nn.Dense(
             self.config.vocab_size,
             use_bias=False,
@@ -646,13 +656,13 @@ def __call__(
 
 @add_start_docstrings(
     """
-    The GPTNeo Model transformer with a language modeling head on top (linear layer with weights tied to the input
+    The Llama Model transformer with a language modeling head on top (linear layer with weights tied to the input
     embeddings).
     """,
-    GPT_NEO_START_DOCSTRING,
+    LLAMA_START_DOCSTRING,
 )
-class FlaxGPTNeoForCausalLM(FlaxGPTNeoPreTrainedModel):
-    module_class = FlaxGPTNeoForCausalLMModule
+class FlaxLlamaForCausalLM(FlaxLlamaPreTrainedModel):
+    module_class = FlaxLlamaForCausalLMModule
 
     def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jnp.DeviceArray] = None):
         # initializing the cache
@@ -660,7 +670,7 @@ def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: O
 
         past_key_values = self.init_cache(batch_size, max_length)
         # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length.
-        # But since GPTNeo uses a causal mask, those positions are masked anyways.
+        # But since Llama uses a causal mask, those positions are masked anyways.
         # Thus we can create a single static attention_mask here, which is more efficient for compilation
         extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
         if attention_mask is not None:
@@ -681,4 +691,19 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
         return model_kwargs
 
 
-append_call_sample_docstring(FlaxGPTNeoForCausalLM, _CHECKPOINT_FOR_DOC, FlaxCausalLMOutput, _CONFIG_FOR_DOC)
+# append_call_sample_docstring(FlaxLlamaForCausalLM, _CHECKPOINT_FOR_DOC, FlaxCausalLMOutput, _CONFIG_FOR_DOC)
+
+if __name__ == "__main__":
+    from .configuration_llama import LlamaConfig
+
+    key = jax.random.PRNGKey(0)
+    config = LlamaConfig()
+
+    model = FlaxLlamaMLP(config, 4 * config.intermediate_size)
+    x = jnp.zeros((4, 128, config.hidden_size))
+
+    key, model_key = jax.random.split(key)
+    params = model.init(model_key, x)
+
+    y = model.apply(params, x)
+    print("done")

From 4f5654d4fb940913cc1ab3aef5247f214063d6d5 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Wed, 21 Jun 2023 09:04:59 +0100
Subject: [PATCH 03/87] Adds Flax implementation of `LlamaMLP` Validated with
 in-file test. Some slight numeric differences, but assuming it isn't an issue

---
 .../models/llama/modeling_flax_llama.py       | 31 +++++++++++++++++--
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index e25822db7c8c5f..86486d0714b1a7 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -297,8 +297,7 @@ def setup(self):
         self.up_proj = nn.Dense(self.intermediate_size, use_bias=False)
 
     def __call__(self, hidden_states):
-        hidden_states = self.up_proj(hidden_states) * self.gate_proj(hidden_states)
-        hidden_states = self.act(hidden_states)
+        hidden_states = self.up_proj(hidden_states) * self.act(self.gate_proj(hidden_states))
         hidden_states = self.down_proj(hidden_states)
         return hidden_states
 
@@ -694,16 +693,42 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
 # append_call_sample_docstring(FlaxLlamaForCausalLM, _CHECKPOINT_FOR_DOC, FlaxCausalLMOutput, _CONFIG_FOR_DOC)
 
 if __name__ == "__main__":
+    import flax
+    from flax.traverse_util import flatten_dict
     from .configuration_llama import LlamaConfig
+    from .modeling_llama import LlamaMLP
+    import torch
 
     key = jax.random.PRNGKey(0)
     config = LlamaConfig()
 
     model = FlaxLlamaMLP(config, 4 * config.intermediate_size)
-    x = jnp.zeros((4, 128, config.hidden_size))
+    pt_model = LlamaMLP(config.hidden_size, 4 * config.intermediate_size, config.hidden_act)
+
+    key, subkey = jax.random.split(key)
+    x = jax.random.normal(subkey, (4, 128, config.hidden_size)) * 0.1
 
     key, model_key = jax.random.split(key)
     params = model.init(model_key, x)
 
     y = model.apply(params, x)
+
+    params = flatten_dict(params, sep='.')
+    pt_model.load_state_dict({
+        'gate_proj.weight': torch.from_numpy(np.asarray(params['params.gate_proj.kernel'])).T,
+        'down_proj.weight': torch.from_numpy(np.asarray(params['params.down_proj.kernel'])).T,
+        'up_proj.weight': torch.from_numpy(np.asarray(params['params.up_proj.kernel'])).T,
+    })
+    x = torch.tensor(np.asarray(x))
+    pt_y = pt_model(x)
+
+    y = np.asarray(y)
+    pt_y = pt_y.detach().numpy()
+
+    try:
+        np.testing.assert_allclose(y, pt_y, atol=1e-4, rtol=1e-4)
+    except AssertionError as e:
+        import ipdb; ipdb.set_trace()
+
+    print(config)
     print("done")

From 36c48fab40d019f949fede35fb7f52c8a3ca3f24 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Wed, 21 Jun 2023 09:20:54 +0100
Subject: [PATCH 04/87] Adds `FlaxLlamaRMSNorm` layer `flax.linen` includes
 `RMSNorm` layer but not necessarily in all versions. Hence, we add in-file.

---
 .../models/llama/modeling_flax_llama.py       | 35 +++++++++++++------
 1 file changed, 24 insertions(+), 11 deletions(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index 86486d0714b1a7..d255b187443ce1 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -141,6 +141,21 @@ def apply_rotary_pos_emb(tensor, sincos):
     cos_pos = cos_pos[:, :, None, :].repeat(2, 3)
     return (tensor * cos_pos) + (rotate_every_two(tensor) * sin_pos)
 
+class FlaxLlamaRMSNorm(nn.Module):
+    eps: float = 1e-6
+
+    @nn.compact
+    def __call__(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        variance = jnp.asarray(hidden_states, dtype=jnp.float32)
+        variance = jnp.square(variance)
+        variance = variance.mean(-1, keepdims=True)
+        hidden_states = hidden_states * jax.lax.rsqrt(variance + self.eps)
+
+        weight = self.param('weight', lambda _, shape: jnp.ones(shape), hidden_states.shape[-1])
+
+        return jnp.asarray(weight * hidden_states, dtype=input_dtype)
+
 
 class FlaxLlamaAttention(nn.Module):
     config: LlamaConfig
@@ -311,9 +326,9 @@ def setup(self):
         hidden_size = self.config.hidden_size
         inner_dim = self.config.intermediate_size if self.config.intermediate_size is not None else 4 * hidden_size
 
-        self.ln_1 = nn.RMSNorm(epsilon=self.config.rms_norm_eps, dtype=self.dtype)
+        self.ln_1 = FlaxLlamaRMSNorm(epsilon=self.config.rms_norm_eps, dtype=self.dtype)
         self.attn = FlaxLlamaAttention(self.config, dtype=self.dtype)
-        self.ln_2 = nn.RMSNorm(epsilon=self.config.rms_norm_eps, dtype=self.dtype)
+        self.ln_2 = FlaxLlamaRMSNorm(epsilon=self.config.rms_norm_eps, dtype=self.dtype)
         self.mlp = FlaxLlamaMLP(self.config, inner_dim, dtype=self.dtype)
 
     def __call__(
@@ -696,28 +711,26 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
     import flax
     from flax.traverse_util import flatten_dict
     from .configuration_llama import LlamaConfig
-    from .modeling_llama import LlamaMLP
+    from .modeling_llama import LlamaRMSNorm
     import torch
 
     key = jax.random.PRNGKey(0)
     config = LlamaConfig()
 
-    model = FlaxLlamaMLP(config, 4 * config.intermediate_size)
-    pt_model = LlamaMLP(config.hidden_size, 4 * config.intermediate_size, config.hidden_act)
+    model = FlaxLlamaRMSNorm(eps=1e-6)
+    pt_model = LlamaRMSNorm(config.hidden_size, eps=1e-6)
 
     key, subkey = jax.random.split(key)
-    x = jax.random.normal(subkey, (4, 128, config.hidden_size)) * 0.1
+    x = jax.random.normal(subkey, (4, 128, config.hidden_size)) * 1.0
 
     key, model_key = jax.random.split(key)
     params = model.init(model_key, x)
 
     y = model.apply(params, x)
 
-    params = flatten_dict(params, sep='.')
+    params = flatten_dict(params['params'], sep='.')
     pt_model.load_state_dict({
-        'gate_proj.weight': torch.from_numpy(np.asarray(params['params.gate_proj.kernel'])).T,
-        'down_proj.weight': torch.from_numpy(np.asarray(params['params.down_proj.kernel'])).T,
-        'up_proj.weight': torch.from_numpy(np.asarray(params['params.up_proj.kernel'])).T,
+        'weight': torch.from_numpy(np.asarray(params['weight'])),
     })
     x = torch.tensor(np.asarray(x))
     pt_y = pt_model(x)
@@ -726,7 +739,7 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
     pt_y = pt_y.detach().numpy()
 
     try:
-        np.testing.assert_allclose(y, pt_y, atol=1e-4, rtol=1e-4)
+        np.testing.assert_allclose(y, pt_y, atol=1e-5, rtol=1e-5)
     except AssertionError as e:
         import ipdb; ipdb.set_trace()
 

From a38b0973842a327f44784e103a2ce93413bb6161 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Fri, 23 Jun 2023 17:52:18 +0100
Subject: [PATCH 05/87] Adds FlaxLlamaAttention Copied from GPT-J as it has
 efficient caching implementation as well as rotary embeddings. Notice
 numerically different, but not by a huge amount. Needs investigating

---
 .../models/llama/modeling_flax_llama.py       | 76 ++++++++++++-------
 1 file changed, 48 insertions(+), 28 deletions(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index d255b187443ce1..04980fcdde81e8 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -115,7 +115,6 @@
             Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
 """
 
-
 def create_sinusoidal_positions(num_pos, dim):
     inv_freq = 1.0 / (10000 ** (np.arange(0, dim, 2) / dim))
     sinusoid_inp = np.einsum("i , j -> i j", np.arange(num_pos), inv_freq).astype("float32")
@@ -141,6 +140,7 @@ def apply_rotary_pos_emb(tensor, sincos):
     cos_pos = cos_pos[:, :, None, :].repeat(2, 3)
     return (tensor * cos_pos) + (rotate_every_two(tensor) * sin_pos)
 
+
 class FlaxLlamaRMSNorm(nn.Module):
     eps: float = 1e-6
 
@@ -160,19 +160,16 @@ def __call__(self, hidden_states):
 class FlaxLlamaAttention(nn.Module):
     config: LlamaConfig
     dtype: jnp.dtype = jnp.float32
+    causal: bool = True
+    is_cross_attention: bool = False
 
     def setup(self):
         config = self.config
         self.embed_dim = config.hidden_size
         self.num_heads = config.num_attention_heads
         self.head_dim = self.embed_dim // self.num_heads
-        self.rotary_dim = config.rotary_dim
 
-        if self.head_dim * self.num_heads != self.embed_dim:
-            raise ValueError(
-                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size} and "
-                f"`num_heads`: {self.num_heads})."
-            )
+        self.rotary_dim = self.head_dim
 
         dense = partial(
             nn.Dense,
@@ -181,7 +178,9 @@ def setup(self):
             dtype=self.dtype,
             kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
         )
-        self.q_proj, self.k_proj, self.v_proj, self.o_proj = [dense() for _ in range(4)]
+
+        self.q_proj, self.k_proj, self.v_proj = dense(), dense(), dense()
+        self.o_proj = dense()
 
         self.causal_mask = make_causal_mask(jnp.ones((1, config.max_position_embeddings), dtype="bool"), dtype="bool")
 
@@ -226,16 +225,16 @@ def _concatenate_to_cache(self, key, value, query, attention_mask):
             attention_mask = combine_masks(pad_mask, attention_mask)
         return key, value, attention_mask
 
-    # TODO: update this call to add rotary and any other changes
     def __call__(
         self,
         hidden_states,
-        attention_mask=None,
+        attention_mask,
+        position_ids,
         deterministic: bool = True,
         init_cache: bool = False,
         output_attentions: bool = False,
     ):
-        query = self.q_proj(hidden_states) * jnp.sqrt(self.head_dim).astype(self.dtype)
+        query = self.q_proj(hidden_states)
         key = self.k_proj(hidden_states)
         value = self.v_proj(hidden_states)
 
@@ -243,6 +242,24 @@ def __call__(
         key = self._split_heads(key)
         value = self._split_heads(value)
 
+        sincos = jnp.take(self.embed_positions, position_ids, axis=0)
+        sincos = jnp.split(sincos, 2, axis=-1)
+        if self.rotary_dim is not None:
+            k_rot = key[:, :, :, : self.rotary_dim]
+            k_pass = key[:, :, :, self.rotary_dim :]
+
+            q_rot = query[:, :, :, : self.rotary_dim]
+            q_pass = query[:, :, :, self.rotary_dim :]
+
+            k_rot = apply_rotary_pos_emb(k_rot, sincos)
+            q_rot = apply_rotary_pos_emb(q_rot, sincos)
+
+            key = jnp.concatenate([k_rot, k_pass], axis=-1)
+            query = jnp.concatenate([q_rot, q_pass], axis=-1)
+        else:
+            key = apply_rotary_pos_emb(key, sincos)
+            query = apply_rotary_pos_emb(query, sincos)
+
         query_length, key_length = query.shape[1], key.shape[1]
 
         if self.has_variable("cache", "cached_key"):
@@ -260,10 +277,6 @@ def __call__(
         attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape)
         attention_mask = combine_masks(attention_mask, causal_mask)
 
-        dropout_rng = None
-        if not deterministic and self.config.attention_dropout > 0.0:
-            dropout_rng = self.make_rng("dropout")
-
         # During fast autoregressive decoding, we feed one position at a time,
         # and cache the keys and values step by step.
         if self.has_variable("cache", "cached_key") or init_cache:
@@ -281,8 +294,6 @@ def __call__(
             query,
             key,
             bias=attention_bias,
-            dropout_rng=dropout_rng,
-            dropout_rate=self.config.attention_dropout,
             deterministic=deterministic,
             dtype=self.dtype,
             precision=None,
@@ -290,8 +301,7 @@ def __call__(
 
         attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value)
         attn_output = self._merge_heads(attn_output)
-        attn_output = self.out_proj(attn_output)
-        attn_output = self.resid_dropout(attn_output, deterministic=deterministic)
+        attn_output = self.o_proj(attn_output)
 
         outputs = (attn_output, attn_weights) if output_attentions else (attn_output,)
         return outputs
@@ -711,37 +721,47 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
     import flax
     from flax.traverse_util import flatten_dict
     from .configuration_llama import LlamaConfig
-    from .modeling_llama import LlamaRMSNorm
+    from .modeling_llama import LlamaAttention, _make_causal_mask
     import torch
 
     key = jax.random.PRNGKey(0)
     config = LlamaConfig()
 
-    model = FlaxLlamaRMSNorm(eps=1e-6)
-    pt_model = LlamaRMSNorm(config.hidden_size, eps=1e-6)
+    model = FlaxLlamaAttention(config)
+    pt_model = LlamaAttention(config)
 
     key, subkey = jax.random.split(key)
-    x = jax.random.normal(subkey, (4, 128, config.hidden_size)) * 1.0
+    x = jax.random.normal(subkey, (4, 128, config.hidden_size)) * 0.1
+    mask = jnp.ones((4, 128), dtype=bool)
+    position_ids = jnp.arange(128)[jnp.newaxis, :].repeat(4, axis=0)
 
     key, model_key = jax.random.split(key)
-    params = model.init(model_key, x)
+    params = model.init(model_key, x, mask, position_ids)
 
-    y = model.apply(params, x)
+    y, = model.apply(params, x, mask, position_ids)
 
     params = flatten_dict(params['params'], sep='.')
+    pt_state = pt_model.state_dict()
     pt_model.load_state_dict({
-        'weight': torch.from_numpy(np.asarray(params['weight'])),
+        'q_proj.weight': torch.from_numpy(np.asarray(params['q_proj.kernel'])).T,
+        'k_proj.weight': torch.from_numpy(np.asarray(params['k_proj.kernel'])).T,
+        'v_proj.weight': torch.from_numpy(np.asarray(params['v_proj.kernel'])).T,
+        'o_proj.weight': torch.from_numpy(np.asarray(params['o_proj.kernel'])).T,
+        'rotary_emb.inv_freq': pt_state['rotary_emb.inv_freq']
     })
     x = torch.tensor(np.asarray(x))
-    pt_y = pt_model(x)
+    # import ipdb; ipdb.set_trace()
+    pt_y = pt_model(x, _make_causal_mask((4, 128), torch.float32, device='cpu'), torch.from_numpy(np.asarray(position_ids)))[0]
 
     y = np.asarray(y)
     pt_y = pt_y.detach().numpy()
 
     try:
-        np.testing.assert_allclose(y, pt_y, atol=1e-5, rtol=1e-5)
+        np.testing.assert_allclose(y, pt_y, atol=1e-3, rtol=1e-3)
     except AssertionError as e:
+        print(e)
         import ipdb; ipdb.set_trace()
 
+
     print(config)
     print("done")

From 573866646657b036c3770124050e3e8ee3a13cf2 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Sun, 25 Jun 2023 07:36:16 +0100
Subject: [PATCH 06/87] Adds `FlaxLlamaDecoderLayer` numerically inaccurate,
 debugging..

---
 .../models/llama/modeling_flax_llama.py       | 48 +++++++++++--------
 1 file changed, 28 insertions(+), 20 deletions(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index 04980fcdde81e8..a42b8d290591bf 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -176,7 +176,7 @@ def setup(self):
             self.embed_dim,
             use_bias=False,
             dtype=self.dtype,
-            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+            # kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
         )
 
         self.q_proj, self.k_proj, self.v_proj = dense(), dense(), dense()
@@ -327,32 +327,35 @@ def __call__(self, hidden_states):
         return hidden_states
 
 
-class FlaxLlamaBlock(nn.Module):
+# TODO: make sure attention output format is same as Pytorch
+# for now, we just worry about model numerics
+class FlaxLlamaDecoderLayer(nn.Module):
     config: LlamaConfig
-    layer_id: int = 0
     dtype: jnp.dtype = jnp.float32
 
     def setup(self):
         hidden_size = self.config.hidden_size
         inner_dim = self.config.intermediate_size if self.config.intermediate_size is not None else 4 * hidden_size
 
-        self.ln_1 = FlaxLlamaRMSNorm(epsilon=self.config.rms_norm_eps, dtype=self.dtype)
-        self.attn = FlaxLlamaAttention(self.config, dtype=self.dtype)
-        self.ln_2 = FlaxLlamaRMSNorm(epsilon=self.config.rms_norm_eps, dtype=self.dtype)
+        self.input_layernorm = FlaxLlamaRMSNorm(eps=self.config.rms_norm_eps)
+        self.self_attn = FlaxLlamaAttention(self.config, dtype=self.dtype)
+        self.post_attention_layernorm = FlaxLlamaRMSNorm(eps=self.config.rms_norm_eps)
         self.mlp = FlaxLlamaMLP(self.config, inner_dim, dtype=self.dtype)
 
     def __call__(
         self,
         hidden_states,
+        position_ids = None,
         attention_mask=None,
         deterministic: bool = True,
         init_cache: bool = False,
         output_attentions: bool = False,
     ):
         residual = hidden_states
-        hidden_states = self.ln_1(hidden_states)
-        outputs = self.attn(
+        hidden_states = self.input_layernorm(hidden_states)
+        outputs = self.self_attn(
             hidden_states,
+            position_ids=position_ids,
             attention_mask=attention_mask,
             deterministic=deterministic,
             init_cache=init_cache,
@@ -363,10 +366,10 @@ def __call__(
         hidden_states = attn_output + residual
 
         residual = hidden_states
-        hidden_states = self.ln_2(hidden_states)
-        feed_forward_hidden_states = self.mlp(hidden_states, deterministic=deterministic)
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        feed_forward_hidden_states = self.mlp(hidden_states)
         # residual connection
-        hidden_states = residual + feed_forward_hidden_states
+        hidden_states = residual + hidden_states
 
         return (hidden_states,) + outputs[1:]
 
@@ -721,14 +724,15 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
     import flax
     from flax.traverse_util import flatten_dict
     from .configuration_llama import LlamaConfig
-    from .modeling_llama import LlamaAttention, _make_causal_mask
+    from .modeling_llama import LlamaDecoderLayer, _make_causal_mask
     import torch
 
     key = jax.random.PRNGKey(0)
+    torch.manual_seed(0)
     config = LlamaConfig()
 
-    model = FlaxLlamaAttention(config)
-    pt_model = LlamaAttention(config)
+    model = FlaxLlamaDecoderLayer(config)
+    pt_model = LlamaDecoderLayer(config)
 
     key, subkey = jax.random.split(key)
     x = jax.random.normal(subkey, (4, 128, config.hidden_size)) * 0.1
@@ -743,14 +747,18 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
     params = flatten_dict(params['params'], sep='.')
     pt_state = pt_model.state_dict()
     pt_model.load_state_dict({
-        'q_proj.weight': torch.from_numpy(np.asarray(params['q_proj.kernel'])).T,
-        'k_proj.weight': torch.from_numpy(np.asarray(params['k_proj.kernel'])).T,
-        'v_proj.weight': torch.from_numpy(np.asarray(params['v_proj.kernel'])).T,
-        'o_proj.weight': torch.from_numpy(np.asarray(params['o_proj.kernel'])).T,
-        'rotary_emb.inv_freq': pt_state['rotary_emb.inv_freq']
+        'self_attn.q_proj.weight': torch.from_numpy(np.asarray(params['self_attn.q_proj.kernel'])).T,
+        'self_attn.k_proj.weight': torch.from_numpy(np.asarray(params['self_attn.k_proj.kernel'])).T,
+        'self_attn.v_proj.weight': torch.from_numpy(np.asarray(params['self_attn.v_proj.kernel'])).T,
+        'self_attn.o_proj.weight': torch.from_numpy(np.asarray(params['self_attn.o_proj.kernel'])).T,
+        'self_attn.rotary_emb.inv_freq': pt_state['self_attn.rotary_emb.inv_freq'],
+        'input_layernorm.weight': torch.from_numpy(np.asarray(params['input_layernorm.weight'])),
+        'post_attention_layernorm.weight': torch.from_numpy(np.asarray(params['post_attention_layernorm.weight'])),
+        'mlp.down_proj.weight': torch.from_numpy(np.asarray(params['mlp.down_proj.kernel'])).T,
+        'mlp.up_proj.weight': torch.from_numpy(np.asarray(params['mlp.up_proj.kernel'])).T,
+        'mlp.gate_proj.weight': torch.from_numpy(np.asarray(params['mlp.gate_proj.kernel'])).T,
     })
     x = torch.tensor(np.asarray(x))
-    # import ipdb; ipdb.set_trace()
     pt_y = pt_model(x, _make_causal_mask((4, 128), torch.float32, device='cpu'), torch.from_numpy(np.asarray(position_ids)))[0]
 
     y = np.asarray(y)

From 401a72f96ecdf42aeab0104e8e86324efcf37cd4 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Sun, 25 Jun 2023 09:53:55 +0100
Subject: [PATCH 07/87] debugging rotary mismatch gptj uses interleaved whilst
 llama uses contiguous i think they match now but still final result is wrong.
 maybe drop back to just debugging attention layer?

---
 .../models/llama/modeling_flax_llama.py       | 91 ++++++++++++-------
 1 file changed, 58 insertions(+), 33 deletions(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index a42b8d290591bf..a743d50c023d96 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -117,28 +117,39 @@
 
 def create_sinusoidal_positions(num_pos, dim):
     inv_freq = 1.0 / (10000 ** (np.arange(0, dim, 2) / dim))
-    sinusoid_inp = np.einsum("i , j -> i j", np.arange(num_pos), inv_freq).astype("float32")
-    sin, cos = np.sin(sinusoid_inp), np.cos(sinusoid_inp)
+    freqs = np.einsum("i , j -> i j", np.arange(num_pos), inv_freq).astype("float32")
 
-    sentinel = dim // 2 + dim % 2
-    out = np.zeros((num_pos, dim))
-    out[:, 0:sentinel] = sin
-    out[:, sentinel:] = cos
+    # sin, cos = np.sin(freqs), np.cos(freqs)
+    # sentinel = dim // 2 + dim % 2
+    # out = np.zeros((num_pos, dim))
+    # out[:, 0:sentinel] = sin
+    # out[:, sentinel:] = cos
 
-    return jnp.array(out)
+    emb = np.concatenate((freqs, freqs), axis=-1)
 
+    out = np.concatenate((
+        np.sin(emb)[:, None, :],
+        np.cos(emb)[:, None, :]
+    ), axis=-1)
+    return jnp.array(out[:, :, :num_pos]) # TODO: don't think slice is needed
 
-def rotate_every_two(tensor):
-    rotate_half_tensor = jnp.stack((-tensor[:, :, :, 1::2], tensor[:, :, :, ::2]), axis=-1)
-    rotate_half_tensor = rotate_half_tensor.reshape(rotate_half_tensor.shape[:-2] + (-1,))
+
+# def rotate_every_two(tensor):
+    # rotate_half_tensor = jnp.stack((-tensor[:, :, :, 1::2], tensor[:, :, :, ::2]), axis=-1)
+    # rotate_half_tensor = rotate_half_tensor.reshape(rotate_half_tensor.shape[:-2] + (-1,))
+    # return rotate_half_tensor
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    rotate_half_tensor = jnp.concatenate((-x[..., x.shape[-1] // 2:], x[..., :x.shape[-1] // 2]), axis=-1)
+    # rotate_half_tensor = rotate_half_tensor.reshape(rotate_half_tensor.shape[:-2] + (-1,))
     return rotate_half_tensor
 
 
 def apply_rotary_pos_emb(tensor, sincos):
     sin_pos, cos_pos = sincos
-    sin_pos = sin_pos[:, :, None, :].repeat(2, 3)
-    cos_pos = cos_pos[:, :, None, :].repeat(2, 3)
-    return (tensor * cos_pos) + (rotate_every_two(tensor) * sin_pos)
+    # sin_pos = sin_pos[:, :, None, :]
+    return (tensor * cos_pos) + (rotate_half(tensor) * sin_pos)
 
 
 class FlaxLlamaRMSNorm(nn.Module):
@@ -169,8 +180,6 @@ def setup(self):
         self.num_heads = config.num_attention_heads
         self.head_dim = self.embed_dim // self.num_heads
 
-        self.rotary_dim = self.head_dim
-
         dense = partial(
             nn.Dense,
             self.embed_dim,
@@ -184,8 +193,7 @@ def setup(self):
 
         self.causal_mask = make_causal_mask(jnp.ones((1, config.max_position_embeddings), dtype="bool"), dtype="bool")
 
-        pos_embd_dim = self.rotary_dim or self.embed_dim
-        self.embed_positions = create_sinusoidal_positions(config.max_position_embeddings, pos_embd_dim)
+        self.embed_positions = create_sinusoidal_positions(config.max_position_embeddings, self.head_dim)
 
     def _split_heads(self, hidden_states):
         return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim))
@@ -242,23 +250,24 @@ def __call__(
         key = self._split_heads(key)
         value = self._split_heads(value)
 
-        sincos = jnp.take(self.embed_positions, position_ids, axis=0)
+        # sincos = jnp.take(self.embed_positions, position_ids, axis=1)
+        sincos = self.embed_positions[position_ids]
         sincos = jnp.split(sincos, 2, axis=-1)
-        if self.rotary_dim is not None:
-            k_rot = key[:, :, :, : self.rotary_dim]
-            k_pass = key[:, :, :, self.rotary_dim :]
+        # if self.rotary_dim is not None:
+            # k_rot = key[:, :, :, : self.rotary_dim]
+            # k_pass = key[:, :, :, self.rotary_dim :]
 
-            q_rot = query[:, :, :, : self.rotary_dim]
-            q_pass = query[:, :, :, self.rotary_dim :]
+            # q_rot = query[:, :, :, : self.rotary_dim]
+            # q_pass = query[:, :, :, self.rotary_dim :]
 
-            k_rot = apply_rotary_pos_emb(k_rot, sincos)
-            q_rot = apply_rotary_pos_emb(q_rot, sincos)
+            # k_rot = apply_rotary_pos_emb(k_rot, sincos)
+            # q_rot = apply_rotary_pos_emb(q_rot, sincos)
 
-            key = jnp.concatenate([k_rot, k_pass], axis=-1)
-            query = jnp.concatenate([q_rot, q_pass], axis=-1)
-        else:
-            key = apply_rotary_pos_emb(key, sincos)
-            query = apply_rotary_pos_emb(query, sincos)
+            # key = jnp.concatenate([k_rot, k_pass], axis=-1)
+            # query = jnp.concatenate([q_rot, q_pass], axis=-1)
+        # else:
+        key = apply_rotary_pos_emb(key, sincos)
+        query = apply_rotary_pos_emb(query, sincos)
 
         query_length, key_length = query.shape[1], key.shape[1]
 
@@ -727,6 +736,22 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
     from .modeling_llama import LlamaDecoderLayer, _make_causal_mask
     import torch
 
+    # from .modeling_llama import LlamaRotaryEmbedding, apply_rotary_pos_emb
+    # x = torch.randn(4, 64, 32)
+    # pt_x = x.view(4, 64, 2, 16).transpose(1, 2)
+    # layer = LlamaRotaryEmbedding(16, max_position_embeddings=64)
+    # cos, sin = layer(x, x.shape[-2])
+    # pt_y = apply_rotary_pos_emb(pt_x, pt_x, cos, sin, None)[0]
+
+    # from .modeling_flax_llama import create_sinusoidal_positions, apply_rotary_pos_emb
+    # sincos = create_sinusoidal_positions(64, 16)
+    
+    # sincos = jnp.split(sincos, 2, axis=-1)
+    # y = apply_rotary_pos_emb(x.detach().numpy().reshape(x.shape[:2] + (2, 16)), sincos)
+
+    # import ipdb; ipdb.set_trace()
+    # exit()
+
     key = jax.random.PRNGKey(0)
     torch.manual_seed(0)
     config = LlamaConfig()
@@ -740,9 +765,9 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
     position_ids = jnp.arange(128)[jnp.newaxis, :].repeat(4, axis=0)
 
     key, model_key = jax.random.split(key)
-    params = model.init(model_key, x, mask, position_ids)
+    params = model.init(model_key, x, attention_mask=mask, position_ids=position_ids)
 
-    y, = model.apply(params, x, mask, position_ids)
+    y, = model.apply(params, x, attention_mask=mask, position_ids=position_ids)
 
     params = flatten_dict(params['params'], sep='.')
     pt_state = pt_model.state_dict()
@@ -759,7 +784,7 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
         'mlp.gate_proj.weight': torch.from_numpy(np.asarray(params['mlp.gate_proj.kernel'])).T,
     })
     x = torch.tensor(np.asarray(x))
-    pt_y = pt_model(x, _make_causal_mask((4, 128), torch.float32, device='cpu'), torch.from_numpy(np.asarray(position_ids)))[0]
+    pt_y = pt_model(x, attention_mask=_make_causal_mask((4, 128), torch.float32, device='cpu'), position_ids=torch.from_numpy(np.asarray(position_ids)))[0]
 
     y = np.asarray(y)
     pt_y = pt_y.detach().numpy()

From 5177bfdf9dc6569274b3c90232a9b34c6097613d Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Tue, 27 Jun 2023 07:34:38 +0100
Subject: [PATCH 08/87] fixes bug with decoder layer still somewhat numerically
 inaccurate, but close enough for now

---
 .../models/llama/modeling_flax_llama.py           | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index a743d50c023d96..8ec73e340bba8e 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -305,7 +305,6 @@ def __call__(
             bias=attention_bias,
             deterministic=deterministic,
             dtype=self.dtype,
-            precision=None,
         )
 
         attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value)
@@ -355,7 +354,7 @@ def __call__(
         self,
         hidden_states,
         position_ids = None,
-        attention_mask=None,
+        attention_mask = None,
         deterministic: bool = True,
         init_cache: bool = False,
         output_attentions: bool = False,
@@ -372,14 +371,14 @@ def __call__(
         )
         # residual connection
         attn_output = outputs[0]
-        hidden_states = attn_output + residual
+        hidden_states = residual + attn_output
 
         residual = hidden_states
         hidden_states = self.post_attention_layernorm(hidden_states)
-        feed_forward_hidden_states = self.mlp(hidden_states)
+        hidden_states = self.mlp(hidden_states)
         # residual connection
         hidden_states = residual + hidden_states
-
+        
         return (hidden_states,) + outputs[1:]
 
 
@@ -765,9 +764,9 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
     position_ids = jnp.arange(128)[jnp.newaxis, :].repeat(4, axis=0)
 
     key, model_key = jax.random.split(key)
-    params = model.init(model_key, x, attention_mask=mask, position_ids=position_ids)
+    y, params = model.init_with_output(model_key, x, attention_mask=mask, position_ids=position_ids)
 
-    y, = model.apply(params, x, attention_mask=mask, position_ids=position_ids)
+    # y, = model.apply(params, x, attention_mask=mask, position_ids=position_ids)
 
     params = flatten_dict(params['params'], sep='.')
     pt_state = pt_model.state_dict()
@@ -786,7 +785,7 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
     x = torch.tensor(np.asarray(x))
     pt_y = pt_model(x, attention_mask=_make_causal_mask((4, 128), torch.float32, device='cpu'), position_ids=torch.from_numpy(np.asarray(position_ids)))[0]
 
-    y = np.asarray(y)
+    y = np.asarray(y[0])
     pt_y = pt_y.detach().numpy()
 
     try:

From 578e0d98d4fc6ca894aa7222f0bb946669ae27ad Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Tue, 27 Jun 2023 08:00:07 +0100
Subject: [PATCH 09/87] adds markers for what to implement next the structure
 here diverges a lot from the PT version. not a big fan of it, but just get
 something working for now

---
 src/transformers/models/llama/modeling_flax_llama.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index 8ec73e340bba8e..b2812c1358fa5e 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -336,7 +336,6 @@ def __call__(self, hidden_states):
 
 
 # TODO: make sure attention output format is same as Pytorch
-# for now, we just worry about model numerics
 class FlaxLlamaDecoderLayer(nn.Module):
     config: LlamaConfig
     dtype: jnp.dtype = jnp.float32
@@ -382,6 +381,7 @@ def __call__(
         return (hidden_states,) + outputs[1:]
 
 
+# TODO: check this is ported
 class FlaxLlamaPreTrainedModel(FlaxPreTrainedModel):
     """
     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
@@ -514,6 +514,7 @@ def __call__(
         return outputs
 
 
+# TODO: implement
 class FlaxLlamaBlockCollection(nn.Module):
     config: LlamaConfig
     dtype: jnp.dtype = jnp.float32
@@ -559,6 +560,7 @@ def __call__(
         return outputs
 
 
+# TODO: implement
 class FlaxLlamaModule(nn.Module):
     config: LlamaConfig
     dtype: jnp.dtype = jnp.float32
@@ -571,11 +573,6 @@ def setup(self):
             self.embed_dim,
             embedding_init=embedding_init,
         )
-        self.wpe = nn.Embed(
-            self.config.max_position_embeddings,
-            self.embed_dim,
-            embedding_init=embedding_init,
-        )
         self.dropout = nn.Dropout(rate=self.config.embed_dropout)
         self.h = FlaxLlamaBlockCollection(self.config, dtype=self.dtype)
         self.ln_f = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
@@ -636,10 +633,10 @@ def __call__(
 class FlaxLlamaModel(FlaxLlamaPreTrainedModel):
     module_class = FlaxLlamaModule
 
-
 # append_call_sample_docstring(FlaxLlamaModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutput, _CONFIG_FOR_DOC)
 
 
+# TODO: implement
 class FlaxLlamaForCausalLMModule(nn.Module):
     config: LlamaConfig
     dtype: jnp.dtype = jnp.float32

From 9f74d838c48fa8382b55b5b571772406efd20f4f Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Mon, 10 Jul 2023 17:16:14 +0100
Subject: [PATCH 10/87] implements `FlaxLlamaBlockCollection`] tolerance must
 be higher than expected, kinda disconcerting

---
 .../models/llama/modeling_flax_llama.py       | 60 +++++++++----------
 1 file changed, 30 insertions(+), 30 deletions(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index b2812c1358fa5e..d0afaacc433423 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -521,30 +521,26 @@ class FlaxLlamaBlockCollection(nn.Module):
 
     def setup(self):
         self.blocks = [
-            FlaxLlamaBlock(self.config, layer_id=i, name=str(i), dtype=self.dtype)
+            FlaxLlamaDecoderLayer(self.config, dtype=self.dtype, name=str(i))
             for i in range(self.config.num_hidden_layers)
         ]
 
     def __call__(
         self,
         hidden_states,
+        position_ids=None,
         attention_mask=None,
         deterministic: bool = True,
         init_cache: bool = False,
         output_attentions: bool = False,
-        output_hidden_states: bool = False,
-        return_dict: bool = True,
     ):
         all_attentions = () if output_attentions else None
-        all_hidden_states = () if output_hidden_states else None
 
         for block in self.blocks:
-            if output_hidden_states:
-                all_hidden_states += (hidden_states,)
-
             layer_outputs = block(
                 hidden_states,
-                attention_mask,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
                 deterministic=deterministic,
                 init_cache=init_cache,
                 output_attentions=output_attentions,
@@ -555,7 +551,7 @@ def __call__(
                 all_attentions += (layer_outputs[1],)
 
         # this contains possible `None` values - `FlaxLlamaModule` will filter them out
-        outputs = (hidden_states, all_hidden_states, all_attentions)
+        outputs = (hidden_states, all_attentions)
 
         return outputs
 
@@ -729,7 +725,7 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
     import flax
     from flax.traverse_util import flatten_dict
     from .configuration_llama import LlamaConfig
-    from .modeling_llama import LlamaDecoderLayer, _make_causal_mask
+    from .modeling_llama import LlamaModel, _make_causal_mask
     import torch
 
     # from .modeling_llama import LlamaRotaryEmbedding, apply_rotary_pos_emb
@@ -750,10 +746,10 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
 
     key = jax.random.PRNGKey(0)
     torch.manual_seed(0)
-    config = LlamaConfig()
+    config = LlamaConfig(num_hidden_layers=3)
 
-    model = FlaxLlamaDecoderLayer(config)
-    pt_model = LlamaDecoderLayer(config)
+    model = FlaxLlamaBlockCollection(config)
+    pt_model = LlamaModel(config).layers
 
     key, subkey = jax.random.split(key)
     x = jax.random.normal(subkey, (4, 128, config.hidden_size)) * 0.1
@@ -766,27 +762,31 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
     # y, = model.apply(params, x, attention_mask=mask, position_ids=position_ids)
 
     params = flatten_dict(params['params'], sep='.')
-    pt_state = pt_model.state_dict()
-    pt_model.load_state_dict({
-        'self_attn.q_proj.weight': torch.from_numpy(np.asarray(params['self_attn.q_proj.kernel'])).T,
-        'self_attn.k_proj.weight': torch.from_numpy(np.asarray(params['self_attn.k_proj.kernel'])).T,
-        'self_attn.v_proj.weight': torch.from_numpy(np.asarray(params['self_attn.v_proj.kernel'])).T,
-        'self_attn.o_proj.weight': torch.from_numpy(np.asarray(params['self_attn.o_proj.kernel'])).T,
-        'self_attn.rotary_emb.inv_freq': pt_state['self_attn.rotary_emb.inv_freq'],
-        'input_layernorm.weight': torch.from_numpy(np.asarray(params['input_layernorm.weight'])),
-        'post_attention_layernorm.weight': torch.from_numpy(np.asarray(params['post_attention_layernorm.weight'])),
-        'mlp.down_proj.weight': torch.from_numpy(np.asarray(params['mlp.down_proj.kernel'])).T,
-        'mlp.up_proj.weight': torch.from_numpy(np.asarray(params['mlp.up_proj.kernel'])).T,
-        'mlp.gate_proj.weight': torch.from_numpy(np.asarray(params['mlp.gate_proj.kernel'])).T,
-    })
-    x = torch.tensor(np.asarray(x))
-    pt_y = pt_model(x, attention_mask=_make_causal_mask((4, 128), torch.float32, device='cpu'), position_ids=torch.from_numpy(np.asarray(position_ids)))[0]
+
+    for i, l in enumerate(pt_model):
+        pt_state = l.state_dict()
+        l.load_state_dict({
+            'self_attn.q_proj.weight': torch.from_numpy(np.asarray(params[f'{i}.self_attn.q_proj.kernel'])).T,
+            'self_attn.k_proj.weight': torch.from_numpy(np.asarray(params[f'{i}.self_attn.k_proj.kernel'])).T,
+            'self_attn.v_proj.weight': torch.from_numpy(np.asarray(params[f'{i}.self_attn.v_proj.kernel'])).T,
+            'self_attn.o_proj.weight': torch.from_numpy(np.asarray(params[f'{i}.self_attn.o_proj.kernel'])).T,
+            'self_attn.rotary_emb.inv_freq': pt_state[f'self_attn.rotary_emb.inv_freq'],
+            'input_layernorm.weight': torch.from_numpy(np.asarray(params[f'{i}.input_layernorm.weight'])),
+            'post_attention_layernorm.weight': torch.from_numpy(np.asarray(params[f'{i}.post_attention_layernorm.weight'])),
+            'mlp.down_proj.weight': torch.from_numpy(np.asarray(params[f'{i}.mlp.down_proj.kernel'])).T,
+            'mlp.up_proj.weight': torch.from_numpy(np.asarray(params[f'{i}.mlp.up_proj.kernel'])).T,
+            'mlp.gate_proj.weight': torch.from_numpy(np.asarray(params[f'{i}.mlp.gate_proj.kernel'])).T,
+        })
+
+    h = torch.tensor(np.asarray(x))
+    for l in pt_model:
+        h = l(h, attention_mask=_make_causal_mask((4, 128), torch.float32, device='cpu'), position_ids=torch.from_numpy(np.asarray(position_ids)))[0]
 
     y = np.asarray(y[0])
-    pt_y = pt_y.detach().numpy()
+    pt_y = h.detach().numpy()
 
     try:
-        np.testing.assert_allclose(y, pt_y, atol=1e-3, rtol=1e-3)
+        np.testing.assert_allclose(y, pt_y, atol=1e-2, rtol=1e-2)
     except AssertionError as e:
         print(e)
         import ipdb; ipdb.set_trace()

From a60b00ff202e84239025dd610b41aafb3b7a0cb2 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Tue, 11 Jul 2023 21:55:36 +0100
Subject: [PATCH 11/87] =?UTF-8?q?Adds=20`FlaxLlamaModule`=20equivalent=20P?=
 =?UTF-8?q?yTorch=20model=20is=20`LlamaModel`=20yay!=20a=20language=20mode?=
 =?UTF-8?q?l=F0=9F=A4=97?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../models/llama/modeling_flax_llama.py       | 122 +++++++++---------
 1 file changed, 64 insertions(+), 58 deletions(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index d0afaacc433423..f63859178235c8 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -335,7 +335,6 @@ def __call__(self, hidden_states):
         return hidden_states
 
 
-# TODO: make sure attention output format is same as Pytorch
 class FlaxLlamaDecoderLayer(nn.Module):
     config: LlamaConfig
     dtype: jnp.dtype = jnp.float32
@@ -514,7 +513,6 @@ def __call__(
         return outputs
 
 
-# TODO: implement
 class FlaxLlamaBlockCollection(nn.Module):
     config: LlamaConfig
     dtype: jnp.dtype = jnp.float32
@@ -556,70 +554,66 @@ def __call__(
         return outputs
 
 
-# TODO: implement
 class FlaxLlamaModule(nn.Module):
     config: LlamaConfig
     dtype: jnp.dtype = jnp.float32
 
     def setup(self):
-        self.embed_dim = self.config.hidden_size
+        self.hidden_size = self.config.hidden_size
         embedding_init = jax.nn.initializers.normal(stddev=self.config.initializer_range)
-        self.wte = nn.Embed(
+        self.embed_tokens = nn.Embed(
             self.config.vocab_size,
-            self.embed_dim,
+            self.hidden_size,
             embedding_init=embedding_init,
         )
-        self.dropout = nn.Dropout(rate=self.config.embed_dropout)
-        self.h = FlaxLlamaBlockCollection(self.config, dtype=self.dtype)
-        self.ln_f = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype)
+        self.layers = FlaxLlamaBlockCollection(self.config, dtype=self.dtype)
+        self.norm = FlaxLlamaRMSNorm(self.config.rms_norm_eps)
 
     def __call__(
         self,
         input_ids,
-        attention_mask,
-        position_ids,
+        position_ids=None,
+        attention_mask=None,
         deterministic=True,
         init_cache: bool = False,
         output_attentions: bool = False,
-        output_hidden_states: bool = False,
-        return_dict: bool = True,
+        # TODO: implement these args
+        # output_hidden_states: bool = False,
+        # return_dict: bool = True,
     ):
-        input_embeds = self.wte(input_ids.astype("i4"))
-        position_embeds = self.wpe(position_ids.astype("i4"))
-
-        hidden_states = input_embeds + position_embeds
-        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
+        input_embeds = self.embed_tokens(input_ids.astype("i4"))
 
-        outputs = self.h(
-            hidden_states,
-            attention_mask,
+        outputs = self.layers(
+            input_embeds,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
             deterministic=deterministic,
             init_cache=init_cache,
             output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
+            # output_hidden_states=output_hidden_states,
+            # return_dict=return_dict,
         )
 
         hidden_states = outputs[0]
-        hidden_states = self.ln_f(hidden_states)
+        hidden_states = self.norm(hidden_states)
 
-        hidden_states = outputs[0]
-        hidden_states = self.ln_f(hidden_states)
+        # TODO: implement this
+        # if output_hidden_states:
+            # all_hidden_states = outputs[1] + (hidden_states,)
+            # outputs = (hidden_states, all_hidden_states) + outputs[2:]
+        # else:
+            # outputs = (hidden_states,) + outputs[1:]
 
-        if output_hidden_states:
-            all_hidden_states = outputs[1] + (hidden_states,)
-            outputs = (hidden_states, all_hidden_states) + outputs[2:]
-        else:
-            outputs = (hidden_states,) + outputs[1:]
+        # if not return_dict:
+            # return tuple(v for v in outputs if v is not None)
 
-        if not return_dict:
-            return tuple(v for v in outputs if v is not None)
+        # return FlaxBaseModelOutput(
+            # last_hidden_state=hidden_states,
+            # hidden_states=outputs[1],
+            # attentions=outputs[-1],
+        # )
 
-        return FlaxBaseModelOutput(
-            last_hidden_state=hidden_states,
-            hidden_states=outputs[1],
-            attentions=outputs[-1],
-        )
+        return hidden_states
 
 
 @add_start_docstrings(
@@ -728,6 +722,7 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
     from .modeling_llama import LlamaModel, _make_causal_mask
     import torch
 
+
     # from .modeling_llama import LlamaRotaryEmbedding, apply_rotary_pos_emb
     # x = torch.randn(4, 64, 32)
     # pt_x = x.view(4, 64, 2, 16).transpose(1, 2)
@@ -746,13 +741,15 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
 
     key = jax.random.PRNGKey(0)
     torch.manual_seed(0)
-    config = LlamaConfig(num_hidden_layers=3)
+    config = LlamaConfig(num_hidden_layers=2, vocab_size=16)
+    print(config)
 
-    model = FlaxLlamaBlockCollection(config)
-    pt_model = LlamaModel(config).layers
+    model = FlaxLlamaModule(config)
+    pt_model = LlamaModel(config)
 
     key, subkey = jax.random.split(key)
-    x = jax.random.normal(subkey, (4, 128, config.hidden_size)) * 0.1
+    # x = jax.random.normal(subkey, (4, 128, config.hidden_size)) * 0.1
+    x = jax.random.randint(subkey, (4, 128), 0, 16)
     mask = jnp.ones((4, 128), dtype=bool)
     position_ids = jnp.arange(128)[jnp.newaxis, :].repeat(4, axis=0)
 
@@ -763,27 +760,37 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
 
     params = flatten_dict(params['params'], sep='.')
 
-    for i, l in enumerate(pt_model):
+    # import ipdb; ipdb.set_trace()
+
+    for i, l in enumerate(pt_model.layers):
         pt_state = l.state_dict()
         l.load_state_dict({
-            'self_attn.q_proj.weight': torch.from_numpy(np.asarray(params[f'{i}.self_attn.q_proj.kernel'])).T,
-            'self_attn.k_proj.weight': torch.from_numpy(np.asarray(params[f'{i}.self_attn.k_proj.kernel'])).T,
-            'self_attn.v_proj.weight': torch.from_numpy(np.asarray(params[f'{i}.self_attn.v_proj.kernel'])).T,
-            'self_attn.o_proj.weight': torch.from_numpy(np.asarray(params[f'{i}.self_attn.o_proj.kernel'])).T,
+            'self_attn.q_proj.weight': torch.from_numpy(np.asarray(params[f'layers.{i}.self_attn.q_proj.kernel'])).T,
+            'self_attn.k_proj.weight': torch.from_numpy(np.asarray(params[f'layers.{i}.self_attn.k_proj.kernel'])).T,
+            'self_attn.v_proj.weight': torch.from_numpy(np.asarray(params[f'layers.{i}.self_attn.v_proj.kernel'])).T,
+            'self_attn.o_proj.weight': torch.from_numpy(np.asarray(params[f'layers.{i}.self_attn.o_proj.kernel'])).T,
             'self_attn.rotary_emb.inv_freq': pt_state[f'self_attn.rotary_emb.inv_freq'],
-            'input_layernorm.weight': torch.from_numpy(np.asarray(params[f'{i}.input_layernorm.weight'])),
-            'post_attention_layernorm.weight': torch.from_numpy(np.asarray(params[f'{i}.post_attention_layernorm.weight'])),
-            'mlp.down_proj.weight': torch.from_numpy(np.asarray(params[f'{i}.mlp.down_proj.kernel'])).T,
-            'mlp.up_proj.weight': torch.from_numpy(np.asarray(params[f'{i}.mlp.up_proj.kernel'])).T,
-            'mlp.gate_proj.weight': torch.from_numpy(np.asarray(params[f'{i}.mlp.gate_proj.kernel'])).T,
+            'input_layernorm.weight': torch.from_numpy(np.asarray(params[f'layers.{i}.input_layernorm.weight'])),
+            'post_attention_layernorm.weight': torch.from_numpy(np.asarray(params[f'layers.{i}.post_attention_layernorm.weight'])),
+            'mlp.down_proj.weight': torch.from_numpy(np.asarray(params[f'layers.{i}.mlp.down_proj.kernel'])).T,
+            'mlp.up_proj.weight': torch.from_numpy(np.asarray(params[f'layers.{i}.mlp.up_proj.kernel'])).T,
+            'mlp.gate_proj.weight': torch.from_numpy(np.asarray(params[f'layers.{i}.mlp.gate_proj.kernel'])).T,
         })
 
-    h = torch.tensor(np.asarray(x))
-    for l in pt_model:
-        h = l(h, attention_mask=_make_causal_mask((4, 128), torch.float32, device='cpu'), position_ids=torch.from_numpy(np.asarray(position_ids)))[0]
+    pt_model.embed_tokens.weight = torch.nn.Parameter(torch.from_numpy(np.asarray(params['embed_tokens.embedding'])))
+    pt_model.norm.weight = torch.nn.Parameter(torch.from_numpy(np.asarray(params['norm.weight'])))
+
+    x_pt = torch.tensor(np.asarray(x))
+    # for l in pt_model:
+        # h = l(h, attention_mask=_make_causal_mask((4, 128), torch.float32, device='cpu'), position_ids=torch.from_numpy(np.asarray(position_ids)))[0]
 
-    y = np.asarray(y[0])
-    pt_y = h.detach().numpy()
+    # y = pt_model(x_pt, attention_mask=_make_causal_mask((4, 128), torch.float32, device='cpu'), position_ids=torch.from_numpy(np.asarray(position_ids)))[0]
+    pt_y = pt_model(x_pt, attention_mask=torch.from_numpy(np.asarray(mask)), position_ids=torch.from_numpy(np.asarray(position_ids)))[0]
+
+
+    pt_y = pt_y.detach().numpy()
+    #
+    # pt_y = h.detach().numpy()
 
     try:
         np.testing.assert_allclose(y, pt_y, atol=1e-2, rtol=1e-2)
@@ -792,5 +799,4 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
         import ipdb; ipdb.set_trace()
 
 
-    print(config)
     print("done")

From c7ac55b3036d2107ac98fe2c406db1220491bcb1 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Wed, 12 Jul 2023 20:11:18 +0100
Subject: [PATCH 12/87] adds `FlaxLlamaForCausalLMModule` equivalent to
 `LlamaForCausalLM` still missing returning dict or tuple, will add later

---
 .../models/llama/modeling_flax_llama.py       | 73 +++++++++----------
 1 file changed, 36 insertions(+), 37 deletions(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index f63859178235c8..398718f29a0157 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -626,13 +626,12 @@ class FlaxLlamaModel(FlaxLlamaPreTrainedModel):
 # append_call_sample_docstring(FlaxLlamaModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutput, _CONFIG_FOR_DOC)
 
 
-# TODO: implement
 class FlaxLlamaForCausalLMModule(nn.Module):
     config: LlamaConfig
     dtype: jnp.dtype = jnp.float32
 
     def setup(self):
-        self.transformer = FlaxLlamaModule(self.config, dtype=self.dtype)
+        self.model = FlaxLlamaModule(self.config, dtype=self.dtype)
         self.lm_head = nn.Dense(
             self.config.vocab_size,
             use_bias=False,
@@ -643,37 +642,36 @@ def setup(self):
     def __call__(
         self,
         input_ids,
-        attention_mask,
-        position_ids,
+        position_ids=None,
+        attention_mask=None,
         deterministic: bool = True,
         init_cache: bool = False,
         output_attentions: bool = False,
-        output_hidden_states: bool = False,
-        return_dict: bool = True,
+        # output_hidden_states: bool = False,
+        # return_dict: bool = True,
     ):
-        outputs = self.transformer(
+        outputs = self.model(
             input_ids,
-            attention_mask,
-            position_ids,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
             deterministic=deterministic,
             init_cache=init_cache,
             output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
+            # output_hidden_states=output_hidden_states,
+            # return_dict=return_dict,
         )
 
-        hidden_states = outputs[0]
+        # TODO: add this back when we return `FlaxBaseModelOutput`
+        # hidden_states = outputs[0]
+        hidden_states = outputs
+        lm_logits = self.lm_head(hidden_states)
 
-        if self.config.tie_word_embeddings:
-            shared_kernel = self.transformer.variables["params"]["wte"]["embedding"].T
-            lm_logits = self.lm_head.apply({"params": {"kernel": shared_kernel}}, hidden_states)
-        else:
-            lm_logits = self.lm_head(hidden_states)
-
-        if not return_dict:
-            return (lm_logits,) + outputs[1:]
+        # if not return_dict:
+            # return (lm_logits,) + outputs[1:]
 
-        return FlaxCausalLMOutput(logits=lm_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)
+        # TODO: return FlaxCausalLMOutput
+        # return FlaxCausalLMOutput(logits=lm_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)
+        return lm_logits
 
 
 @add_start_docstrings(
@@ -719,7 +717,7 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
     import flax
     from flax.traverse_util import flatten_dict
     from .configuration_llama import LlamaConfig
-    from .modeling_llama import LlamaModel, _make_causal_mask
+    from .modeling_llama import LlamaForCausalLM, _make_causal_mask
     import torch
 
 
@@ -744,8 +742,8 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
     config = LlamaConfig(num_hidden_layers=2, vocab_size=16)
     print(config)
 
-    model = FlaxLlamaModule(config)
-    pt_model = LlamaModel(config)
+    model = FlaxLlamaForCausalLMModule(config)
+    pt_model = LlamaForCausalLM(config)
 
     key, subkey = jax.random.split(key)
     # x = jax.random.normal(subkey, (4, 128, config.hidden_size)) * 0.1
@@ -760,25 +758,26 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
 
     params = flatten_dict(params['params'], sep='.')
 
-    # import ipdb; ipdb.set_trace()
+    import ipdb; ipdb.set_trace()
 
-    for i, l in enumerate(pt_model.layers):
+    for i, l in enumerate(pt_model.model.layers):
         pt_state = l.state_dict()
         l.load_state_dict({
-            'self_attn.q_proj.weight': torch.from_numpy(np.asarray(params[f'layers.{i}.self_attn.q_proj.kernel'])).T,
-            'self_attn.k_proj.weight': torch.from_numpy(np.asarray(params[f'layers.{i}.self_attn.k_proj.kernel'])).T,
-            'self_attn.v_proj.weight': torch.from_numpy(np.asarray(params[f'layers.{i}.self_attn.v_proj.kernel'])).T,
-            'self_attn.o_proj.weight': torch.from_numpy(np.asarray(params[f'layers.{i}.self_attn.o_proj.kernel'])).T,
+            'self_attn.q_proj.weight': torch.from_numpy(np.asarray(params[f'model.layers.{i}.self_attn.q_proj.kernel'])).T,
+            'self_attn.k_proj.weight': torch.from_numpy(np.asarray(params[f'model.layers.{i}.self_attn.k_proj.kernel'])).T,
+            'self_attn.v_proj.weight': torch.from_numpy(np.asarray(params[f'model.layers.{i}.self_attn.v_proj.kernel'])).T,
+            'self_attn.o_proj.weight': torch.from_numpy(np.asarray(params[f'model.layers.{i}.self_attn.o_proj.kernel'])).T,
             'self_attn.rotary_emb.inv_freq': pt_state[f'self_attn.rotary_emb.inv_freq'],
-            'input_layernorm.weight': torch.from_numpy(np.asarray(params[f'layers.{i}.input_layernorm.weight'])),
-            'post_attention_layernorm.weight': torch.from_numpy(np.asarray(params[f'layers.{i}.post_attention_layernorm.weight'])),
-            'mlp.down_proj.weight': torch.from_numpy(np.asarray(params[f'layers.{i}.mlp.down_proj.kernel'])).T,
-            'mlp.up_proj.weight': torch.from_numpy(np.asarray(params[f'layers.{i}.mlp.up_proj.kernel'])).T,
-            'mlp.gate_proj.weight': torch.from_numpy(np.asarray(params[f'layers.{i}.mlp.gate_proj.kernel'])).T,
+            'input_layernorm.weight': torch.from_numpy(np.asarray(params[f'model.layers.{i}.input_layernorm.weight'])),
+            'post_attention_layernorm.weight': torch.from_numpy(np.asarray(params[f'model.layers.{i}.post_attention_layernorm.weight'])),
+            'mlp.down_proj.weight': torch.from_numpy(np.asarray(params[f'model.layers.{i}.mlp.down_proj.kernel'])).T,
+            'mlp.up_proj.weight': torch.from_numpy(np.asarray(params[f'model.layers.{i}.mlp.up_proj.kernel'])).T,
+            'mlp.gate_proj.weight': torch.from_numpy(np.asarray(params[f'model.layers.{i}.mlp.gate_proj.kernel'])).T,
         })
 
-    pt_model.embed_tokens.weight = torch.nn.Parameter(torch.from_numpy(np.asarray(params['embed_tokens.embedding'])))
-    pt_model.norm.weight = torch.nn.Parameter(torch.from_numpy(np.asarray(params['norm.weight'])))
+    pt_model.model.embed_tokens.weight = torch.nn.Parameter(torch.from_numpy(np.asarray(params['model.embed_tokens.embedding'])))
+    pt_model.model.norm.weight = torch.nn.Parameter(torch.from_numpy(np.asarray(params['model.norm.weight'])))
+    pt_model.lm_head.weight = torch.nn.Parameter(torch.from_numpy(np.asarray(params['lm_head.kernel'].T)))
 
     x_pt = torch.tensor(np.asarray(x))
     # for l in pt_model:

From b6dff5a10d88ea1893085bbd3ca929f3d26855ec Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Wed, 12 Jul 2023 20:31:32 +0100
Subject: [PATCH 13/87] start porting pretrained wrappers realised it probably
 needs return dict as a prereq

---
 .../models/llama/modeling_flax_llama.py       | 41 ++++---------------
 1 file changed, 9 insertions(+), 32 deletions(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index 398718f29a0157..c94b99a6cd9aca 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -411,7 +411,9 @@ def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: Froz
         params_rng, dropout_rng = jax.random.split(rng)
         rngs = {"params": params_rng, "dropout": dropout_rng}
 
-        random_params = self.module.init(rngs, input_ids, attention_mask, position_ids, return_dict=False)["params"]
+        # TODO: add return_dict
+        # random_params = self.module.init(rngs, input_ids, position_ids=position_ids, attention_mask=attention_mask, return_dict=False)["params"]
+        random_params = self.module.init(rngs, input_ids, position_ids=position_ids, attention_mask=attention_mask)["params"]
 
         if params is not None:
             random_params = flatten_dict(unfreeze(random_params))
@@ -437,8 +439,11 @@ def init_cache(self, batch_size, max_length):
         attention_mask = jnp.ones_like(input_ids)
         position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
 
+        # init_variables = self.module.init(
+            # jax.random.PRNGKey(0), input_ids, position_ids=position_ids, attention_mask=attention_mask, return_dict=False, init_cache=True
+        # )
         init_variables = self.module.init(
-            jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True
+            jax.random.PRNGKey(0), input_ids, position_ids=position_ids, attention_mask=attention_mask, init_cache=True
         )
         return unfreeze(init_variables["cache"])
 
@@ -446,8 +451,8 @@ def init_cache(self, batch_size, max_length):
     def __call__(
         self,
         input_ids,
-        attention_mask=None,
         position_ids=None,
+        attention_mask=None,
         params: dict = None,
         past_key_values: dict = None,
         dropout_rng: jax.random.PRNGKey = None,
@@ -490,8 +495,8 @@ def __call__(
         outputs = self.module.apply(
             inputs,
             jnp.array(input_ids, dtype="i4"),
-            jnp.array(attention_mask, dtype="i4"),
             jnp.array(position_ids, dtype="i4"),
+            jnp.array(attention_mask, dtype="i4"),
             not train,
             False,
             output_attentions,
@@ -720,23 +725,6 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
     from .modeling_llama import LlamaForCausalLM, _make_causal_mask
     import torch
 
-
-    # from .modeling_llama import LlamaRotaryEmbedding, apply_rotary_pos_emb
-    # x = torch.randn(4, 64, 32)
-    # pt_x = x.view(4, 64, 2, 16).transpose(1, 2)
-    # layer = LlamaRotaryEmbedding(16, max_position_embeddings=64)
-    # cos, sin = layer(x, x.shape[-2])
-    # pt_y = apply_rotary_pos_emb(pt_x, pt_x, cos, sin, None)[0]
-
-    # from .modeling_flax_llama import create_sinusoidal_positions, apply_rotary_pos_emb
-    # sincos = create_sinusoidal_positions(64, 16)
-    
-    # sincos = jnp.split(sincos, 2, axis=-1)
-    # y = apply_rotary_pos_emb(x.detach().numpy().reshape(x.shape[:2] + (2, 16)), sincos)
-
-    # import ipdb; ipdb.set_trace()
-    # exit()
-
     key = jax.random.PRNGKey(0)
     torch.manual_seed(0)
     config = LlamaConfig(num_hidden_layers=2, vocab_size=16)
@@ -746,7 +734,6 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
     pt_model = LlamaForCausalLM(config)
 
     key, subkey = jax.random.split(key)
-    # x = jax.random.normal(subkey, (4, 128, config.hidden_size)) * 0.1
     x = jax.random.randint(subkey, (4, 128), 0, 16)
     mask = jnp.ones((4, 128), dtype=bool)
     position_ids = jnp.arange(128)[jnp.newaxis, :].repeat(4, axis=0)
@@ -754,12 +741,8 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
     key, model_key = jax.random.split(key)
     y, params = model.init_with_output(model_key, x, attention_mask=mask, position_ids=position_ids)
 
-    # y, = model.apply(params, x, attention_mask=mask, position_ids=position_ids)
-
     params = flatten_dict(params['params'], sep='.')
 
-    import ipdb; ipdb.set_trace()
-
     for i, l in enumerate(pt_model.model.layers):
         pt_state = l.state_dict()
         l.load_state_dict({
@@ -780,16 +763,10 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
     pt_model.lm_head.weight = torch.nn.Parameter(torch.from_numpy(np.asarray(params['lm_head.kernel'].T)))
 
     x_pt = torch.tensor(np.asarray(x))
-    # for l in pt_model:
-        # h = l(h, attention_mask=_make_causal_mask((4, 128), torch.float32, device='cpu'), position_ids=torch.from_numpy(np.asarray(position_ids)))[0]
-
-    # y = pt_model(x_pt, attention_mask=_make_causal_mask((4, 128), torch.float32, device='cpu'), position_ids=torch.from_numpy(np.asarray(position_ids)))[0]
     pt_y = pt_model(x_pt, attention_mask=torch.from_numpy(np.asarray(mask)), position_ids=torch.from_numpy(np.asarray(position_ids)))[0]
 
 
     pt_y = pt_y.detach().numpy()
-    #
-    # pt_y = h.detach().numpy()
 
     try:
         np.testing.assert_allclose(y, pt_y, atol=1e-2, rtol=1e-2)

From 7bf356758b99c1eec154c3762eead0a6daa041bd Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Fri, 14 Jul 2023 15:43:47 +0100
Subject: [PATCH 14/87] cleanup, quality, style

---
 .../models/llama/modeling_flax_llama.py       | 143 +++++++++---------
 1 file changed, 69 insertions(+), 74 deletions(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index c94b99a6cd9aca..8ffce7ef5a2f4a 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -26,7 +26,6 @@
 from flax.traverse_util import flatten_dict, unflatten_dict
 from jax import lax
 
-from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxCausalLMOutput
 from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel
 from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_llama import LlamaConfig
@@ -115,40 +114,24 @@
             Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
 """
 
+
 def create_sinusoidal_positions(num_pos, dim):
     inv_freq = 1.0 / (10000 ** (np.arange(0, dim, 2) / dim))
     freqs = np.einsum("i , j -> i j", np.arange(num_pos), inv_freq).astype("float32")
 
-    # sin, cos = np.sin(freqs), np.cos(freqs)
-    # sentinel = dim // 2 + dim % 2
-    # out = np.zeros((num_pos, dim))
-    # out[:, 0:sentinel] = sin
-    # out[:, sentinel:] = cos
-
     emb = np.concatenate((freqs, freqs), axis=-1)
-
-    out = np.concatenate((
-        np.sin(emb)[:, None, :],
-        np.cos(emb)[:, None, :]
-    ), axis=-1)
-    return jnp.array(out[:, :, :num_pos]) # TODO: don't think slice is needed
+    out = np.concatenate((np.sin(emb)[:, None, :], np.cos(emb)[:, None, :]), axis=-1)
+    return jnp.array(out[:, :, :num_pos])  # TODO: don't think slice is needed
 
 
-# def rotate_every_two(tensor):
-    # rotate_half_tensor = jnp.stack((-tensor[:, :, :, 1::2], tensor[:, :, :, ::2]), axis=-1)
-    # rotate_half_tensor = rotate_half_tensor.reshape(rotate_half_tensor.shape[:-2] + (-1,))
-    # return rotate_half_tensor
-
 def rotate_half(x):
     """Rotates half the hidden dims of the input."""
-    rotate_half_tensor = jnp.concatenate((-x[..., x.shape[-1] // 2:], x[..., :x.shape[-1] // 2]), axis=-1)
-    # rotate_half_tensor = rotate_half_tensor.reshape(rotate_half_tensor.shape[:-2] + (-1,))
+    rotate_half_tensor = jnp.concatenate((-x[..., x.shape[-1] // 2 :], x[..., : x.shape[-1] // 2]), axis=-1)
     return rotate_half_tensor
 
 
 def apply_rotary_pos_emb(tensor, sincos):
     sin_pos, cos_pos = sincos
-    # sin_pos = sin_pos[:, :, None, :]
     return (tensor * cos_pos) + (rotate_half(tensor) * sin_pos)
 
 
@@ -163,7 +146,7 @@ def __call__(self, hidden_states):
         variance = variance.mean(-1, keepdims=True)
         hidden_states = hidden_states * jax.lax.rsqrt(variance + self.eps)
 
-        weight = self.param('weight', lambda _, shape: jnp.ones(shape), hidden_states.shape[-1])
+        weight = self.param("weight", lambda _, shape: jnp.ones(shape), hidden_states.shape[-1])
 
         return jnp.asarray(weight * hidden_states, dtype=input_dtype)
 
@@ -185,14 +168,13 @@ def setup(self):
             self.embed_dim,
             use_bias=False,
             dtype=self.dtype,
-            # kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
         )
 
         self.q_proj, self.k_proj, self.v_proj = dense(), dense(), dense()
         self.o_proj = dense()
 
         self.causal_mask = make_causal_mask(jnp.ones((1, config.max_position_embeddings), dtype="bool"), dtype="bool")
-
         self.embed_positions = create_sinusoidal_positions(config.max_position_embeddings, self.head_dim)
 
     def _split_heads(self, hidden_states):
@@ -250,22 +232,9 @@ def __call__(
         key = self._split_heads(key)
         value = self._split_heads(value)
 
-        # sincos = jnp.take(self.embed_positions, position_ids, axis=1)
         sincos = self.embed_positions[position_ids]
         sincos = jnp.split(sincos, 2, axis=-1)
-        # if self.rotary_dim is not None:
-            # k_rot = key[:, :, :, : self.rotary_dim]
-            # k_pass = key[:, :, :, self.rotary_dim :]
 
-            # q_rot = query[:, :, :, : self.rotary_dim]
-            # q_pass = query[:, :, :, self.rotary_dim :]
-
-            # k_rot = apply_rotary_pos_emb(k_rot, sincos)
-            # q_rot = apply_rotary_pos_emb(q_rot, sincos)
-
-            # key = jnp.concatenate([k_rot, k_pass], axis=-1)
-            # query = jnp.concatenate([q_rot, q_pass], axis=-1)
-        # else:
         key = apply_rotary_pos_emb(key, sincos)
         query = apply_rotary_pos_emb(query, sincos)
 
@@ -351,8 +320,8 @@ def setup(self):
     def __call__(
         self,
         hidden_states,
-        position_ids = None,
-        attention_mask = None,
+        position_ids=None,
+        attention_mask=None,
         deterministic: bool = True,
         init_cache: bool = False,
         output_attentions: bool = False,
@@ -376,7 +345,7 @@ def __call__(
         hidden_states = self.mlp(hidden_states)
         # residual connection
         hidden_states = residual + hidden_states
-        
+
         return (hidden_states,) + outputs[1:]
 
 
@@ -413,7 +382,9 @@ def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: Froz
 
         # TODO: add return_dict
         # random_params = self.module.init(rngs, input_ids, position_ids=position_ids, attention_mask=attention_mask, return_dict=False)["params"]
-        random_params = self.module.init(rngs, input_ids, position_ids=position_ids, attention_mask=attention_mask)["params"]
+        random_params = self.module.init(rngs, input_ids, position_ids=position_ids, attention_mask=attention_mask)[
+            "params"
+        ]
 
         if params is not None:
             random_params = flatten_dict(unfreeze(random_params))
@@ -440,7 +411,7 @@ def init_cache(self, batch_size, max_length):
         position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
 
         # init_variables = self.module.init(
-            # jax.random.PRNGKey(0), input_ids, position_ids=position_ids, attention_mask=attention_mask, return_dict=False, init_cache=True
+        # jax.random.PRNGKey(0), input_ids, position_ids=position_ids, attention_mask=attention_mask, return_dict=False, init_cache=True
         # )
         init_variables = self.module.init(
             jax.random.PRNGKey(0), input_ids, position_ids=position_ids, attention_mask=attention_mask, init_cache=True
@@ -604,18 +575,18 @@ def __call__(
 
         # TODO: implement this
         # if output_hidden_states:
-            # all_hidden_states = outputs[1] + (hidden_states,)
-            # outputs = (hidden_states, all_hidden_states) + outputs[2:]
+        # all_hidden_states = outputs[1] + (hidden_states,)
+        # outputs = (hidden_states, all_hidden_states) + outputs[2:]
         # else:
-            # outputs = (hidden_states,) + outputs[1:]
+        # outputs = (hidden_states,) + outputs[1:]
 
         # if not return_dict:
-            # return tuple(v for v in outputs if v is not None)
+        # return tuple(v for v in outputs if v is not None)
 
         # return FlaxBaseModelOutput(
-            # last_hidden_state=hidden_states,
-            # hidden_states=outputs[1],
-            # attentions=outputs[-1],
+        # last_hidden_state=hidden_states,
+        # hidden_states=outputs[1],
+        # attentions=outputs[-1],
         # )
 
         return hidden_states
@@ -628,6 +599,7 @@ def __call__(
 class FlaxLlamaModel(FlaxLlamaPreTrainedModel):
     module_class = FlaxLlamaModule
 
+
 # append_call_sample_docstring(FlaxLlamaModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutput, _CONFIG_FOR_DOC)
 
 
@@ -672,7 +644,7 @@ def __call__(
         lm_logits = self.lm_head(hidden_states)
 
         # if not return_dict:
-            # return (lm_logits,) + outputs[1:]
+        # return (lm_logits,) + outputs[1:]
 
         # TODO: return FlaxCausalLMOutput
         # return FlaxCausalLMOutput(logits=lm_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)
@@ -719,12 +691,11 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
 # append_call_sample_docstring(FlaxLlamaForCausalLM, _CHECKPOINT_FOR_DOC, FlaxCausalLMOutput, _CONFIG_FOR_DOC)
 
 if __name__ == "__main__":
-    import flax
-    from flax.traverse_util import flatten_dict
-    from .configuration_llama import LlamaConfig
-    from .modeling_llama import LlamaForCausalLM, _make_causal_mask
     import torch
 
+    from .configuration_llama import LlamaConfig
+    from .modeling_llama import LlamaForCausalLM
+
     key = jax.random.PRNGKey(0)
     torch.manual_seed(0)
     config = LlamaConfig(num_hidden_layers=2, vocab_size=16)
@@ -741,30 +712,53 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
     key, model_key = jax.random.split(key)
     y, params = model.init_with_output(model_key, x, attention_mask=mask, position_ids=position_ids)
 
-    params = flatten_dict(params['params'], sep='.')
+    params = flatten_dict(params["params"], sep=".")
 
     for i, l in enumerate(pt_model.model.layers):
         pt_state = l.state_dict()
-        l.load_state_dict({
-            'self_attn.q_proj.weight': torch.from_numpy(np.asarray(params[f'model.layers.{i}.self_attn.q_proj.kernel'])).T,
-            'self_attn.k_proj.weight': torch.from_numpy(np.asarray(params[f'model.layers.{i}.self_attn.k_proj.kernel'])).T,
-            'self_attn.v_proj.weight': torch.from_numpy(np.asarray(params[f'model.layers.{i}.self_attn.v_proj.kernel'])).T,
-            'self_attn.o_proj.weight': torch.from_numpy(np.asarray(params[f'model.layers.{i}.self_attn.o_proj.kernel'])).T,
-            'self_attn.rotary_emb.inv_freq': pt_state[f'self_attn.rotary_emb.inv_freq'],
-            'input_layernorm.weight': torch.from_numpy(np.asarray(params[f'model.layers.{i}.input_layernorm.weight'])),
-            'post_attention_layernorm.weight': torch.from_numpy(np.asarray(params[f'model.layers.{i}.post_attention_layernorm.weight'])),
-            'mlp.down_proj.weight': torch.from_numpy(np.asarray(params[f'model.layers.{i}.mlp.down_proj.kernel'])).T,
-            'mlp.up_proj.weight': torch.from_numpy(np.asarray(params[f'model.layers.{i}.mlp.up_proj.kernel'])).T,
-            'mlp.gate_proj.weight': torch.from_numpy(np.asarray(params[f'model.layers.{i}.mlp.gate_proj.kernel'])).T,
-        })
-
-    pt_model.model.embed_tokens.weight = torch.nn.Parameter(torch.from_numpy(np.asarray(params['model.embed_tokens.embedding'])))
-    pt_model.model.norm.weight = torch.nn.Parameter(torch.from_numpy(np.asarray(params['model.norm.weight'])))
-    pt_model.lm_head.weight = torch.nn.Parameter(torch.from_numpy(np.asarray(params['lm_head.kernel'].T)))
+        l.load_state_dict(
+            {
+                "self_attn.q_proj.weight": torch.from_numpy(
+                    np.asarray(params[f"model.layers.{i}.self_attn.q_proj.kernel"])
+                ).T,
+                "self_attn.k_proj.weight": torch.from_numpy(
+                    np.asarray(params[f"model.layers.{i}.self_attn.k_proj.kernel"])
+                ).T,
+                "self_attn.v_proj.weight": torch.from_numpy(
+                    np.asarray(params[f"model.layers.{i}.self_attn.v_proj.kernel"])
+                ).T,
+                "self_attn.o_proj.weight": torch.from_numpy(
+                    np.asarray(params[f"model.layers.{i}.self_attn.o_proj.kernel"])
+                ).T,
+                "self_attn.rotary_emb.inv_freq": pt_state["self_attn.rotary_emb.inv_freq"],
+                "input_layernorm.weight": torch.from_numpy(
+                    np.asarray(params[f"model.layers.{i}.input_layernorm.weight"])
+                ),
+                "post_attention_layernorm.weight": torch.from_numpy(
+                    np.asarray(params[f"model.layers.{i}.post_attention_layernorm.weight"])
+                ),
+                "mlp.down_proj.weight": torch.from_numpy(
+                    np.asarray(params[f"model.layers.{i}.mlp.down_proj.kernel"])
+                ).T,
+                "mlp.up_proj.weight": torch.from_numpy(np.asarray(params[f"model.layers.{i}.mlp.up_proj.kernel"])).T,
+                "mlp.gate_proj.weight": torch.from_numpy(
+                    np.asarray(params[f"model.layers.{i}.mlp.gate_proj.kernel"])
+                ).T,
+            }
+        )
 
-    x_pt = torch.tensor(np.asarray(x))
-    pt_y = pt_model(x_pt, attention_mask=torch.from_numpy(np.asarray(mask)), position_ids=torch.from_numpy(np.asarray(position_ids)))[0]
+    pt_model.model.embed_tokens.weight = torch.nn.Parameter(
+        torch.from_numpy(np.asarray(params["model.embed_tokens.embedding"]))
+    )
+    pt_model.model.norm.weight = torch.nn.Parameter(torch.from_numpy(np.asarray(params["model.norm.weight"])))
+    pt_model.lm_head.weight = torch.nn.Parameter(torch.from_numpy(np.asarray(params["lm_head.kernel"].T)))
 
+    x_pt = torch.tensor(np.asarray(x))
+    pt_y = pt_model(
+        x_pt,
+        attention_mask=torch.from_numpy(np.asarray(mask)),
+        position_ids=torch.from_numpy(np.asarray(position_ids)),
+    )[0]
 
     pt_y = pt_y.detach().numpy()
 
@@ -772,7 +766,8 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
         np.testing.assert_allclose(y, pt_y, atol=1e-2, rtol=1e-2)
     except AssertionError as e:
         print(e)
-        import ipdb; ipdb.set_trace()
+        import ipdb
 
+        ipdb.set_trace()
 
     print("done")

From 99d40a02d3717acbf79124ae17c7df2af48623bb Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Fri, 14 Jul 2023 16:02:12 +0100
Subject: [PATCH 15/87] readds `return_dict` and model output named tuples

---
 .../models/llama/modeling_flax_llama.py       | 74 +++++++++----------
 1 file changed, 34 insertions(+), 40 deletions(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index 8ffce7ef5a2f4a..6827505762bc6a 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -27,6 +27,7 @@
 from jax import lax
 
 from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel
+from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxCausalLMOutput
 from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_llama import LlamaConfig
 
@@ -381,10 +382,7 @@ def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: Froz
         rngs = {"params": params_rng, "dropout": dropout_rng}
 
         # TODO: add return_dict
-        # random_params = self.module.init(rngs, input_ids, position_ids=position_ids, attention_mask=attention_mask, return_dict=False)["params"]
-        random_params = self.module.init(rngs, input_ids, position_ids=position_ids, attention_mask=attention_mask)[
-            "params"
-        ]
+        random_params = self.module.init(rngs, input_ids, position_ids=position_ids, attention_mask=attention_mask, return_dict=False)["params"]
 
         if params is not None:
             random_params = flatten_dict(unfreeze(random_params))
@@ -410,11 +408,8 @@ def init_cache(self, batch_size, max_length):
         attention_mask = jnp.ones_like(input_ids)
         position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
 
-        # init_variables = self.module.init(
-        # jax.random.PRNGKey(0), input_ids, position_ids=position_ids, attention_mask=attention_mask, return_dict=False, init_cache=True
-        # )
         init_variables = self.module.init(
-            jax.random.PRNGKey(0), input_ids, position_ids=position_ids, attention_mask=attention_mask, init_cache=True
+            jax.random.PRNGKey(0), input_ids, position_ids=position_ids, attention_mask=attention_mask, return_dict=False, init_cache=True
         )
         return unfreeze(init_variables["cache"])
 
@@ -507,10 +502,15 @@ def __call__(
         deterministic: bool = True,
         init_cache: bool = False,
         output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = False
     ):
         all_attentions = () if output_attentions else None
+        all_hidden_states = () if output_hidden_states else None
 
         for block in self.blocks:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
             layer_outputs = block(
                 hidden_states,
                 attention_mask=attention_mask,
@@ -525,7 +525,7 @@ def __call__(
                 all_attentions += (layer_outputs[1],)
 
         # this contains possible `None` values - `FlaxLlamaModule` will filter them out
-        outputs = (hidden_states, all_attentions)
+        outputs = (hidden_states, all_hidden_states, all_attentions)
 
         return outputs
 
@@ -553,9 +553,8 @@ def __call__(
         deterministic=True,
         init_cache: bool = False,
         output_attentions: bool = False,
-        # TODO: implement these args
-        # output_hidden_states: bool = False,
-        # return_dict: bool = True,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
     ):
         input_embeds = self.embed_tokens(input_ids.astype("i4"))
 
@@ -566,30 +565,28 @@ def __call__(
             deterministic=deterministic,
             init_cache=init_cache,
             output_attentions=output_attentions,
-            # output_hidden_states=output_hidden_states,
-            # return_dict=return_dict,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
         )
 
         hidden_states = outputs[0]
         hidden_states = self.norm(hidden_states)
 
         # TODO: implement this
-        # if output_hidden_states:
-        # all_hidden_states = outputs[1] + (hidden_states,)
-        # outputs = (hidden_states, all_hidden_states) + outputs[2:]
-        # else:
-        # outputs = (hidden_states,) + outputs[1:]
-
-        # if not return_dict:
-        # return tuple(v for v in outputs if v is not None)
+        if output_hidden_states:
+            all_hidden_states = outputs[1] + (hidden_states,)
+            outputs = (hidden_states, all_hidden_states) + outputs[2:]
+        else:
+            outputs = (hidden_states,) + outputs[1:]
 
-        # return FlaxBaseModelOutput(
-        # last_hidden_state=hidden_states,
-        # hidden_states=outputs[1],
-        # attentions=outputs[-1],
-        # )
+        if not return_dict:
+            return tuple(v for v in outputs if v is not None)
 
-        return hidden_states
+        return FlaxBaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=outputs[1],
+            attentions=outputs[-1],
+        )
 
 
 @add_start_docstrings(
@@ -624,8 +621,8 @@ def __call__(
         deterministic: bool = True,
         init_cache: bool = False,
         output_attentions: bool = False,
-        # output_hidden_states: bool = False,
-        # return_dict: bool = True,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
     ):
         outputs = self.model(
             input_ids,
@@ -634,21 +631,17 @@ def __call__(
             deterministic=deterministic,
             init_cache=init_cache,
             output_attentions=output_attentions,
-            # output_hidden_states=output_hidden_states,
-            # return_dict=return_dict,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
         )
 
-        # TODO: add this back when we return `FlaxBaseModelOutput`
-        # hidden_states = outputs[0]
-        hidden_states = outputs
+        hidden_states = outputs[0]
         lm_logits = self.lm_head(hidden_states)
 
-        # if not return_dict:
-        # return (lm_logits,) + outputs[1:]
+        if not return_dict:
+            return (lm_logits,) + outputs[1:]
 
-        # TODO: return FlaxCausalLMOutput
-        # return FlaxCausalLMOutput(logits=lm_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)
-        return lm_logits
+        return FlaxCausalLMOutput(logits=lm_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)
 
 
 @add_start_docstrings(
@@ -711,6 +704,7 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
 
     key, model_key = jax.random.split(key)
     y, params = model.init_with_output(model_key, x, attention_mask=mask, position_ids=position_ids)
+    y = y[0]
 
     params = flatten_dict(params["params"], sep=".")
 

From 4eebae9a2708f7440dea9e677f1ac5ad4e7d4c5a Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Fri, 14 Jul 2023 16:33:11 +0100
Subject: [PATCH 16/87] =?UTF-8?q?(tentatively)=20pretrained=20wrappers=20w?=
 =?UTF-8?q?ork=20=F0=9F=94=A5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../models/llama/modeling_flax_llama.py            | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index 6827505762bc6a..1b7987abc103ba 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -381,7 +381,6 @@ def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: Froz
         params_rng, dropout_rng = jax.random.split(rng)
         rngs = {"params": params_rng, "dropout": dropout_rng}
 
-        # TODO: add return_dict
         random_params = self.module.init(rngs, input_ids, position_ids=position_ids, attention_mask=attention_mask, return_dict=False)["params"]
 
         if params is not None:
@@ -572,7 +571,6 @@ def __call__(
         hidden_states = outputs[0]
         hidden_states = self.norm(hidden_states)
 
-        # TODO: implement this
         if output_hidden_states:
             all_hidden_states = outputs[1] + (hidden_states,)
             outputs = (hidden_states, all_hidden_states) + outputs[2:]
@@ -691,10 +689,12 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
 
     key = jax.random.PRNGKey(0)
     torch.manual_seed(0)
+
     config = LlamaConfig(num_hidden_layers=2, vocab_size=16)
+    model = FlaxLlamaForCausalLM(config)
     print(config)
 
-    model = FlaxLlamaForCausalLMModule(config)
+    model = FlaxLlamaForCausalLM(config)
     pt_model = LlamaForCausalLM(config)
 
     key, subkey = jax.random.split(key)
@@ -703,10 +703,14 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
     position_ids = jnp.arange(128)[jnp.newaxis, :].repeat(4, axis=0)
 
     key, model_key = jax.random.split(key)
-    y, params = model.init_with_output(model_key, x, attention_mask=mask, position_ids=position_ids)
+    # y, params = model.init_with_output(model_key, x, attention_mask=mask, position_ids=position_ids)
+    params = model.params
+    # y = model(model_key, x, attention_mask=mask, position_ids=position_ids)
+    y = model(x, attention_mask=mask, position_ids=position_ids)
     y = y[0]
 
-    params = flatten_dict(params["params"], sep=".")
+    # params = flatten_dict(params["params"], sep=".")
+    params = flatten_dict(params, sep=".")
 
     for i, l in enumerate(pt_model.model.layers):
         pt_state = l.state_dict()

From 4bb420678f53f59cf0e4ce31032ca0935831f02d Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Sat, 15 Jul 2023 16:39:41 +0100
Subject: [PATCH 17/87] fixes numerical mismatch in `FlaxLlamaRMSNorm` seems
 `jax.lax.rsqrt` does not match `torch.sqrt`. manually computing `1 /
 jax.numpy.sqrt` results in matching values.

---
 .../models/llama/modeling_flax_llama.py            | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index 1b7987abc103ba..2c6f70fa989adf 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -143,9 +143,11 @@ class FlaxLlamaRMSNorm(nn.Module):
     def __call__(self, hidden_states):
         input_dtype = hidden_states.dtype
         variance = jnp.asarray(hidden_states, dtype=jnp.float32)
-        variance = jnp.square(variance)
+        variance = jnp.power(variance, 2)
         variance = variance.mean(-1, keepdims=True)
-        hidden_states = hidden_states * jax.lax.rsqrt(variance + self.eps)
+        # use `jax.numpy.sqrt` as `jax.lax.rsqrt` does not match `torch.rsqrt`
+        # hidden_states = hidden_states * jax.lax.rsqrt(variance + self.eps)
+        hidden_states = hidden_states / jnp.sqrt(variance + self.eps)
 
         weight = self.param("weight", lambda _, shape: jnp.ones(shape), hidden_states.shape[-1])
 
@@ -683,6 +685,8 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
 
 if __name__ == "__main__":
     import torch
+    torch.set_printoptions(precision=8)
+    jnp.set_printoptions(precision=8, floatmode='fixed')
 
     from .configuration_llama import LlamaConfig
     from .modeling_llama import LlamaForCausalLM
@@ -690,7 +694,7 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
     key = jax.random.PRNGKey(0)
     torch.manual_seed(0)
 
-    config = LlamaConfig(num_hidden_layers=2, vocab_size=16)
+    config = LlamaConfig(num_hidden_layers=1, vocab_size=64, hidden_size=64, num_attention_heads=8, max_position_embeddings=128, intermediate_size=256)
     model = FlaxLlamaForCausalLM(config)
     print(config)
 
@@ -698,7 +702,7 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
     pt_model = LlamaForCausalLM(config)
 
     key, subkey = jax.random.split(key)
-    x = jax.random.randint(subkey, (4, 128), 0, 16)
+    x = jax.random.randint(subkey, (4, 128), 0, 64)
     mask = jnp.ones((4, 128), dtype=bool)
     position_ids = jnp.arange(128)[jnp.newaxis, :].repeat(4, axis=0)
 
@@ -761,7 +765,7 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
     pt_y = pt_y.detach().numpy()
 
     try:
-        np.testing.assert_allclose(y, pt_y, atol=1e-2, rtol=1e-2)
+        np.testing.assert_allclose(y, pt_y, atol=1e-6, rtol=1e-6)
     except AssertionError as e:
         print(e)
         import ipdb

From b78671e88c32456f52f6c7f3fd965bd81582b3e7 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Sat, 15 Jul 2023 17:58:48 +0100
Subject: [PATCH 18/87] [WIP] debugging numerics

---
 .../models/llama/modeling_flax_llama.py       | 25 ++++++++++++-------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index 2c6f70fa989adf..67088c12591386 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -227,6 +227,7 @@ def __call__(
         init_cache: bool = False,
         output_attentions: bool = False,
     ):
+        # mismatch between here...
         query = self.q_proj(hidden_states)
         key = self.k_proj(hidden_states)
         value = self.v_proj(hidden_states)
@@ -234,6 +235,7 @@ def __call__(
         query = self._split_heads(query)
         key = self._split_heads(key)
         value = self._split_heads(value)
+        # ...and here
 
         sincos = self.embed_positions[position_ids]
         sincos = jnp.split(sincos, 2, axis=-1)
@@ -694,17 +696,19 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
     key = jax.random.PRNGKey(0)
     torch.manual_seed(0)
 
-    config = LlamaConfig(num_hidden_layers=1, vocab_size=64, hidden_size=64, num_attention_heads=8, max_position_embeddings=128, intermediate_size=256)
-    model = FlaxLlamaForCausalLM(config)
-    print(config)
+    # config = LlamaConfig(num_hidden_layers=1, vocab_size=64, hidden_size=64, num_attention_heads=8, max_position_embeddings=128, intermediate_size=256)
+    # model = FlaxLlamaForCausalLM(config)
+    model = FlaxLlamaForCausalLM.from_pretrained("openlm-research/open_llama_3b", from_pt=True)
+    # print(config)
 
-    model = FlaxLlamaForCausalLM(config)
-    pt_model = LlamaForCausalLM(config)
+    # model = FlaxLlamaForCausalLM(config)
+    # pt_model = LlamaForCausalLM(config)
 
+    N = 1
     key, subkey = jax.random.split(key)
-    x = jax.random.randint(subkey, (4, 128), 0, 64)
-    mask = jnp.ones((4, 128), dtype=bool)
-    position_ids = jnp.arange(128)[jnp.newaxis, :].repeat(4, axis=0)
+    x = jax.random.randint(subkey, (N, 128), 0, 64)
+    mask = jnp.ones((N, 128), dtype=bool)
+    position_ids = jnp.arange(128)[jnp.newaxis, :].repeat(N, axis=0)
 
     key, model_key = jax.random.split(key)
     # y, params = model.init_with_output(model_key, x, attention_mask=mask, position_ids=position_ids)
@@ -713,6 +717,9 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
     y = model(x, attention_mask=mask, position_ids=position_ids)
     y = y[0]
 
+    print(y)
+    exit()
+
     # params = flatten_dict(params["params"], sep=".")
     params = flatten_dict(params, sep=".")
 
@@ -765,7 +772,7 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
     pt_y = pt_y.detach().numpy()
 
     try:
-        np.testing.assert_allclose(y, pt_y, atol=1e-6, rtol=1e-6)
+        np.testing.assert_allclose(y, pt_y, atol=1e-5, rtol=1e-5)
     except AssertionError as e:
         print(e)
         import ipdb

From 3f7bc54ba522ee2e2e0e049dc265aec59febc3f2 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Sun, 16 Jul 2023 12:35:29 +0100
Subject: [PATCH 19/87] numerical match I think issue was accidental change of
 backend. forcing CPU fixes test. We expect some mismatch on GPU.

---
 .../models/llama/modeling_flax_llama.py       | 68 +++++++++++--------
 1 file changed, 39 insertions(+), 29 deletions(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index 67088c12591386..198a6bdd6a1c01 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -26,8 +26,8 @@
 from flax.traverse_util import flatten_dict, unflatten_dict
 from jax import lax
 
-from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel
 from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxCausalLMOutput
+from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel
 from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_llama import LlamaConfig
 
@@ -227,15 +227,12 @@ def __call__(
         init_cache: bool = False,
         output_attentions: bool = False,
     ):
-        # mismatch between here...
         query = self.q_proj(hidden_states)
         key = self.k_proj(hidden_states)
         value = self.v_proj(hidden_states)
-
         query = self._split_heads(query)
         key = self._split_heads(key)
         value = self._split_heads(value)
-        # ...and here
 
         sincos = self.embed_positions[position_ids]
         sincos = jnp.split(sincos, 2, axis=-1)
@@ -299,9 +296,9 @@ def setup(self):
         jax.nn.initializers.normal(self.config.initializer_range)
         self.act = ACT2FN[self.config.hidden_act]
 
-        self.gate_proj = nn.Dense(self.intermediate_size, use_bias=False)
-        self.down_proj = nn.Dense(embed_dim, use_bias=False)
-        self.up_proj = nn.Dense(self.intermediate_size, use_bias=False)
+        self.gate_proj = nn.Dense(self.intermediate_size, use_bias=False, dtype=self.dtype)
+        self.down_proj = nn.Dense(embed_dim, use_bias=False, dtype=self.dtype)
+        self.up_proj = nn.Dense(self.intermediate_size, use_bias=False, dtype=self.dtype)
 
     def __call__(self, hidden_states):
         hidden_states = self.up_proj(hidden_states) * self.act(self.gate_proj(hidden_states))
@@ -385,7 +382,9 @@ def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: Froz
         params_rng, dropout_rng = jax.random.split(rng)
         rngs = {"params": params_rng, "dropout": dropout_rng}
 
-        random_params = self.module.init(rngs, input_ids, position_ids=position_ids, attention_mask=attention_mask, return_dict=False)["params"]
+        random_params = self.module.init(
+            rngs, input_ids, position_ids=position_ids, attention_mask=attention_mask, return_dict=False
+        )["params"]
 
         if params is not None:
             random_params = flatten_dict(unfreeze(random_params))
@@ -412,7 +411,12 @@ def init_cache(self, batch_size, max_length):
         position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
 
         init_variables = self.module.init(
-            jax.random.PRNGKey(0), input_ids, position_ids=position_ids, attention_mask=attention_mask, return_dict=False, init_cache=True
+            jax.random.PRNGKey(0),
+            input_ids,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            return_dict=False,
+            init_cache=True,
         )
         return unfreeze(init_variables["cache"])
 
@@ -506,7 +510,7 @@ def __call__(
         init_cache: bool = False,
         output_attentions: bool = False,
         output_hidden_states: bool = False,
-        return_dict: bool = False
+        return_dict: bool = False,
     ):
         all_attentions = () if output_attentions else None
         all_hidden_states = () if output_hidden_states else None
@@ -685,10 +689,11 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
 
 # append_call_sample_docstring(FlaxLlamaForCausalLM, _CHECKPOINT_FOR_DOC, FlaxCausalLMOutput, _CONFIG_FOR_DOC)
 
-if __name__ == "__main__":
-    import torch
+
+def main():
+    jax.config.update("jax_platform_name", "cpu")
     torch.set_printoptions(precision=8)
-    jnp.set_printoptions(precision=8, floatmode='fixed')
+    jnp.set_printoptions(precision=8, floatmode="fixed")
 
     from .configuration_llama import LlamaConfig
     from .modeling_llama import LlamaForCausalLM
@@ -696,13 +701,20 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
     key = jax.random.PRNGKey(0)
     torch.manual_seed(0)
 
-    # config = LlamaConfig(num_hidden_layers=1, vocab_size=64, hidden_size=64, num_attention_heads=8, max_position_embeddings=128, intermediate_size=256)
+    config = LlamaConfig(
+        num_hidden_layers=16,
+        vocab_size=64,
+        hidden_size=64,
+        num_attention_heads=8,
+        max_position_embeddings=128,
+        intermediate_size=256,
+    )
     # model = FlaxLlamaForCausalLM(config)
-    model = FlaxLlamaForCausalLM.from_pretrained("openlm-research/open_llama_3b", from_pt=True)
-    # print(config)
+    # model = FlaxLlamaForCausalLM.from_pretrained("openlm-research/open_llama_3b", from_pt=True, dtype=jnp.bfloat16)
+    print(config)
 
-    # model = FlaxLlamaForCausalLM(config)
-    # pt_model = LlamaForCausalLM(config)
+    model = FlaxLlamaForCausalLM(config)
+    pt_model = LlamaForCausalLM(config)
 
     N = 1
     key, subkey = jax.random.split(key)
@@ -711,16 +723,10 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
     position_ids = jnp.arange(128)[jnp.newaxis, :].repeat(N, axis=0)
 
     key, model_key = jax.random.split(key)
-    # y, params = model.init_with_output(model_key, x, attention_mask=mask, position_ids=position_ids)
     params = model.params
-    # y = model(model_key, x, attention_mask=mask, position_ids=position_ids)
     y = model(x, attention_mask=mask, position_ids=position_ids)
     y = y[0]
 
-    print(y)
-    exit()
-
-    # params = flatten_dict(params["params"], sep=".")
     params = flatten_dict(params, sep=".")
 
     for i, l in enumerate(pt_model.model.layers):
@@ -756,11 +762,9 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
             }
         )
 
-    pt_model.model.embed_tokens.weight = torch.nn.Parameter(
-        torch.from_numpy(np.asarray(params["model.embed_tokens.embedding"]))
-    )
-    pt_model.model.norm.weight = torch.nn.Parameter(torch.from_numpy(np.asarray(params["model.norm.weight"])))
-    pt_model.lm_head.weight = torch.nn.Parameter(torch.from_numpy(np.asarray(params["lm_head.kernel"].T)))
+    pt_model.model.embed_tokens.weight.copy_(torch.from_numpy(np.asarray(params["model.embed_tokens.embedding"])))
+    pt_model.model.norm.weight.copy_(torch.from_numpy(np.asarray(params["model.norm.weight"])))
+    pt_model.lm_head.weight.copy_(torch.from_numpy(np.asarray(params["lm_head.kernel"].T)))
 
     x_pt = torch.tensor(np.asarray(x))
     pt_y = pt_model(
@@ -780,3 +784,9 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
         ipdb.set_trace()
 
     print("done")
+
+
+if __name__ == "__main__":
+    import torch
+
+    torch.no_grad()(main)()

From e386c853a9129e60abe3455fc8406303a97fa71e Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Sat, 5 Aug 2023 08:14:27 +0100
Subject: [PATCH 20/87] adds in model and integration tests for Flax Llama
 summary of failing: - mul invalid combination of dimensions - one numerical
 mismatch - bf16 conversion (maybe my local backend issue) - params are not
 FrozenDict

---
 src/transformers/__init__.py                  |   1 +
 .../models/auto/modeling_flax_auto.py         |   2 +
 src/transformers/models/llama/__init__.py     |  14 +-
 .../models/llama/modeling_flax_llama.py       |  17 +-
 .../models/llama/test_modeling_flax_llama.py  | 332 ++++++++++++++++++
 5 files changed, 357 insertions(+), 9 deletions(-)
 create mode 100644 tests/models/llama/test_modeling_flax_llama.py

diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 97cc4e578c747a..e41a70306b5ba5 100644
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -4146,6 +4146,7 @@
         ["FlaxGPTNeoForCausalLM", "FlaxGPTNeoModel", "FlaxGPTNeoPreTrainedModel"]
     )
     _import_structure["models.gptj"].extend(["FlaxGPTJForCausalLM", "FlaxGPTJModel", "FlaxGPTJPreTrainedModel"])
+    _import_structure["models.llama"].extend(["FlaxLlamaForCausalLM", "FlaxLlamaModel"])
     _import_structure["models.longt5"].extend(
         ["FlaxLongT5ForConditionalGeneration", "FlaxLongT5Model", "FlaxLongT5PreTrainedModel"]
     )
diff --git a/src/transformers/models/auto/modeling_flax_auto.py b/src/transformers/models/auto/modeling_flax_auto.py
index ebc768963429c1..bf7d87e4e2dbd4 100644
--- a/src/transformers/models/auto/modeling_flax_auto.py
+++ b/src/transformers/models/auto/modeling_flax_auto.py
@@ -43,6 +43,7 @@
         ("gpt2", "FlaxGPT2Model"),
         ("gpt_neo", "FlaxGPTNeoModel"),
         ("gptj", "FlaxGPTJModel"),
+        ("llama", "FlaxLlamaModel"),
         ("longt5", "FlaxLongT5Model"),
         ("marian", "FlaxMarianModel"),
         ("mbart", "FlaxMBartModel"),
@@ -146,6 +147,7 @@
         ("gpt2", "FlaxGPT2LMHeadModel"),
         ("gpt_neo", "FlaxGPTNeoForCausalLM"),
         ("gptj", "FlaxGPTJForCausalLM"),
+        ("llama", "FlaxLlamaForCausalLM"),
         ("opt", "FlaxOPTForCausalLM"),
         ("roberta", "FlaxRobertaForCausalLM"),
         ("roberta-prelayernorm", "FlaxRobertaPreLayerNormForCausalLM"),
diff --git a/src/transformers/models/llama/__init__.py b/src/transformers/models/llama/__init__.py
index 939756084d79ce..c478c835473390 100644
--- a/src/transformers/models/llama/__init__.py
+++ b/src/transformers/models/llama/__init__.py
@@ -19,6 +19,7 @@
     is_sentencepiece_available,
     is_tokenizers_available,
     is_torch_available,
+    is_flax_available
 )
 
 
@@ -43,7 +44,7 @@
     _import_structure["tokenization_llama_fast"] = ["LlamaTokenizerFast"]
 
 try:
-    if not is_torch_available():
+    if not is_flax_available():
         raise OptionalDependencyNotAvailable()
 except OptionalDependencyNotAvailable:
     pass
@@ -55,6 +56,17 @@
         "LlamaForSequenceClassification",
     ]
 
+try:
+    if not is_torch_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["modeling_flax_llama"] = [
+        "FlaxLlamaForCausalLM",
+        "FlaxLlamaModel"
+    ]
+
 
 if TYPE_CHECKING:
     from .configuration_llama import LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP, LlamaConfig
diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index 198a6bdd6a1c01..07e78a095ea79f 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -322,8 +322,8 @@ def setup(self):
     def __call__(
         self,
         hidden_states,
-        position_ids=None,
         attention_mask=None,
+        position_ids=None,
         deterministic: bool = True,
         init_cache: bool = False,
         output_attentions: bool = False,
@@ -424,8 +424,8 @@ def init_cache(self, batch_size, max_length):
     def __call__(
         self,
         input_ids,
-        position_ids=None,
         attention_mask=None,
+        position_ids=None,
         params: dict = None,
         past_key_values: dict = None,
         dropout_rng: jax.random.PRNGKey = None,
@@ -504,8 +504,8 @@ def setup(self):
     def __call__(
         self,
         hidden_states,
-        position_ids=None,
         attention_mask=None,
+        position_ids=None,
         deterministic: bool = True,
         init_cache: bool = False,
         output_attentions: bool = False,
@@ -555,8 +555,8 @@ def setup(self):
     def __call__(
         self,
         input_ids,
-        position_ids=None,
         attention_mask=None,
+        position_ids=None,
         deterministic=True,
         init_cache: bool = False,
         output_attentions: bool = False,
@@ -622,8 +622,8 @@ def setup(self):
     def __call__(
         self,
         input_ids,
-        position_ids=None,
         attention_mask=None,
+        position_ids=None,
         deterministic: bool = True,
         init_cache: bool = False,
         output_attentions: bool = False,
@@ -691,7 +691,7 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
 
 
 def main():
-    jax.config.update("jax_platform_name", "cpu")
+    # jax.config.update("jax_platform_name", "cpu")
     torch.set_printoptions(precision=8)
     jnp.set_printoptions(precision=8, floatmode="fixed")
 
@@ -710,13 +710,14 @@ def main():
         intermediate_size=256,
     )
     # model = FlaxLlamaForCausalLM(config)
-    # model = FlaxLlamaForCausalLM.from_pretrained("openlm-research/open_llama_3b", from_pt=True, dtype=jnp.bfloat16)
+    N = 1
+    model = FlaxLlamaForCausalLM.from_pretrained("decapoda-research/llama-7b-hf", from_pt=True, dtype=jnp.float16, input_shape=(N, 128))
     print(config)
+    exit()
 
     model = FlaxLlamaForCausalLM(config)
     pt_model = LlamaForCausalLM(config)
 
-    N = 1
     key, subkey = jax.random.split(key)
     x = jax.random.randint(subkey, (N, 128), 0, 64)
     mask = jnp.ones((N, 128), dtype=bool)
diff --git a/tests/models/llama/test_modeling_flax_llama.py b/tests/models/llama/test_modeling_flax_llama.py
new file mode 100644
index 00000000000000..f92c0c26b68e52
--- /dev/null
+++ b/tests/models/llama/test_modeling_flax_llama.py
@@ -0,0 +1,332 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import tempfile
+import unittest
+
+import numpy as np
+
+import transformers
+from transformers import LlamaTokenizer, LlamaConfig, is_flax_available, is_torch_available
+from transformers.testing_utils import is_pt_flax_cross_test, require_flax, slow
+
+from ...generation.test_flax_utils import FlaxGenerationTesterMixin
+from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor, random_attention_mask
+
+
+if is_flax_available():
+    import jax
+    import jax.numpy as jnp
+
+    from transformers.modeling_flax_pytorch_utils import (
+        convert_pytorch_state_dict_to_flax,
+        load_flax_weights_in_pytorch_model,
+    )
+    from transformers.models.llama.modeling_flax_llama import FlaxLlamaModel, FlaxLlamaForCausalLM
+
+if is_torch_available():
+    import torch
+
+
+class FlaxLlamaModelTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=14,
+        seq_length=7,
+        is_training=True,
+        use_input_mask=True,
+        use_token_type_ids=False,
+        use_labels=True,
+        vocab_size=99,
+        hidden_size=32,
+        num_hidden_layers=4,
+        num_attention_heads=4,
+        attention_types=[[["global", "local"], 2]],
+        intermediate_size=37,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        window_size=7,
+        initializer_range=0.02,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.seq_length = seq_length
+        self.is_training = is_training
+        self.use_input_mask = use_input_mask
+        self.use_token_type_ids = use_token_type_ids
+        self.use_labels = use_labels
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.attention_types = attention_types
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.window_size = window_size
+        self.initializer_range = initializer_range
+        self.scope = None
+        self.bos_token_id = vocab_size - 1
+        self.eos_token_id = vocab_size - 1
+        self.pad_token_id = vocab_size - 1
+
+    def prepare_config_and_inputs(self):
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+        input_mask = None
+        if self.use_input_mask:
+            input_mask = random_attention_mask([self.batch_size, self.seq_length])
+
+        config = LlamaConfig(
+            vocab_size=self.vocab_size,
+            hidden_size=self.hidden_size,
+            num_hidden_layers=self.num_hidden_layers,
+            num_attention_heads=self.num_attention_heads,
+            intermediate_size=self.intermediate_size,
+            hidden_act=self.hidden_act,
+            hidden_dropout_prob=self.hidden_dropout_prob,
+            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
+            max_position_embeddings=self.max_position_embeddings,
+            use_cache=False,
+            is_decoder=False,
+            initializer_range=self.initializer_range,
+        )
+
+        return (config, input_ids, input_mask)
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        config, input_ids, attention_mask = config_and_inputs
+        inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask}
+        return config, inputs_dict
+
+    def check_use_cache_forward(self, model_class_name, config, input_ids, attention_mask):
+        max_decoder_length = 20
+        model = model_class_name(config)
+
+        past_key_values = model.init_cache(input_ids.shape[0], max_decoder_length)
+        attention_mask = jnp.ones((input_ids.shape[0], max_decoder_length), dtype="i4")
+
+        position_ids = jnp.broadcast_to(
+            jnp.arange(input_ids.shape[-1] - 1)[None, :], (input_ids.shape[0], input_ids.shape[-1] - 1)
+        )
+        outputs_cache = model(
+            input_ids[:, :-1],
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+
+        position_ids = jnp.array(input_ids.shape[0] * [[input_ids.shape[-1] - 1]], dtype="i4")
+        outputs_cache_next = model(
+            input_ids[:, -1:],
+            attention_mask=attention_mask,
+            past_key_values=outputs_cache.past_key_values,
+            position_ids=position_ids,
+        )
+
+        outputs = model(input_ids)
+
+        diff = np.max(np.abs((outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5])))
+        self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
+
+    def check_use_cache_forward_with_attn_mask(self, model_class_name, config, input_ids, attention_mask):
+        max_decoder_length = 20
+        model = model_class_name(config)
+
+        attention_mask_cache = jnp.concatenate(
+            [attention_mask, jnp.zeros((attention_mask.shape[0], max_decoder_length - attention_mask.shape[1]))],
+            axis=-1,
+        )
+
+        past_key_values = model.init_cache(input_ids.shape[0], max_decoder_length)
+        position_ids = jnp.broadcast_to(
+            jnp.arange(input_ids.shape[-1] - 1)[None, :], (input_ids.shape[0], input_ids.shape[-1] - 1)
+        )
+
+        outputs_cache = model(
+            input_ids[:, :-1],
+            attention_mask=attention_mask_cache,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+        position_ids = jnp.array(input_ids.shape[0] * [[input_ids.shape[-1] - 1]], dtype="i4")
+        outputs_cache_next = model(
+            input_ids[:, -1:],
+            past_key_values=outputs_cache.past_key_values,
+            attention_mask=attention_mask_cache,
+            position_ids=position_ids,
+        )
+
+        outputs = model(input_ids, attention_mask=attention_mask)
+
+        diff = np.max(np.abs((outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5])))
+        self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
+
+
+@require_flax
+class FlaxLlamaModelTest(FlaxModelTesterMixin, FlaxGenerationTesterMixin, unittest.TestCase):
+    all_model_classes = (FlaxLlamaModel,) if is_flax_available() else ()
+    all_generative_model_classes = (FlaxLlamaForCausalLM,) if is_flax_available() else ()
+
+    def setUp(self):
+        self.model_tester = FlaxLlamaModelTester(self)
+
+    def test_use_cache_forward(self):
+        for model_class_name in self.all_model_classes:
+            config, input_ids, attention_mask = self.model_tester.prepare_config_and_inputs()
+            self.model_tester.check_use_cache_forward(model_class_name, config, input_ids, attention_mask)
+
+    def test_use_cache_forward_with_attn_mask(self):
+        for model_class_name in self.all_model_classes:
+            config, input_ids, attention_mask = self.model_tester.prepare_config_and_inputs()
+            self.model_tester.check_use_cache_forward_with_attn_mask(
+                model_class_name, config, input_ids, attention_mask
+            )
+
+    # @slow
+    # def test_batch_generation(self):
+        # tokenizer = LlamaTokenizer.from_pretrained("llama", pad_token="<|endoftext|>", padding_side="left")
+        # inputs = tokenizer(["Hello this is a long string", "Hey"], return_tensors="np", padding=True, truncation=True)
+
+        # model = FlaxGPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-125M")
+        # model.do_sample = False
+        # model.config.pad_token_id = model.config.eos_token_id
+
+        # jit_generate = jax.jit(model.generate)
+
+        # output_sequences = jit_generate(
+            # inputs["input_ids"], attention_mask=inputs["attention_mask"], pad_token_id=tokenizer.pad_token_id
+        # ).sequences
+
+        # output_string = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
+
+        # expected_string = [
+            # "Hello this is a long string of text.\n\nI'm trying to get the text of the",
+            # "Hey, I'm a little late to the party. I'm going to",
+        # ]
+
+        # self.assertListEqual(output_string, expected_string)
+
+    # overwrite from common since `attention_mask` in combination
+    # with `causal_mask` behaves slighly differently
+    @is_pt_flax_cross_test
+    def test_equivalence_pt_to_flax(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            with self.subTest(model_class.__name__):
+                # prepare inputs
+                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
+                pt_inputs = {k: torch.tensor(v.tolist()) for k, v in prepared_inputs_dict.items()}
+
+                # load corresponding PyTorch class
+                pt_model_class_name = model_class.__name__[4:]  # Skip the "Flax" at the beginning
+                pt_model_class = getattr(transformers, pt_model_class_name)
+
+                batch_size, seq_length = pt_inputs["input_ids"].shape
+                rnd_start_indices = np.random.randint(0, seq_length - 1, size=(batch_size,))
+                for batch_idx, start_index in enumerate(rnd_start_indices):
+                    pt_inputs["attention_mask"][batch_idx, :start_index] = 0
+                    pt_inputs["attention_mask"][batch_idx, start_index:] = 1
+                    prepared_inputs_dict["attention_mask"][batch_idx, :start_index] = 0
+                    prepared_inputs_dict["attention_mask"][batch_idx, start_index:] = 1
+                pt_model = pt_model_class(config).eval()
+                fx_model = model_class(config, dtype=jnp.float32)
+
+                fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model)
+                fx_model.params = fx_state
+
+                with torch.no_grad():
+                    pt_outputs = pt_model(**pt_inputs).to_tuple()
+
+                fx_outputs = fx_model(**prepared_inputs_dict).to_tuple()
+                self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
+                for fx_output, pt_output in zip(fx_outputs, pt_outputs):
+                    self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2)
+
+                with tempfile.TemporaryDirectory() as tmpdirname:
+                    pt_model.save_pretrained(tmpdirname)
+                    fx_model_loaded = model_class.from_pretrained(tmpdirname, from_pt=True)
+
+                fx_outputs_loaded = fx_model_loaded(**prepared_inputs_dict).to_tuple()
+                self.assertEqual(
+                    len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch"
+                )
+                for fx_output_loaded, pt_output in zip(fx_outputs_loaded, pt_outputs):
+                    self.assert_almost_equals(fx_output_loaded[:, -1], pt_output[:, -1].numpy(), 4e-2)
+
+    # overwrite from common since `attention_mask` in combination
+    # with `causal_mask` behaves slighly differently
+    @is_pt_flax_cross_test
+    def test_equivalence_flax_to_pt(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        for model_class in self.all_model_classes:
+            with self.subTest(model_class.__name__):
+                # prepare inputs
+                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
+                pt_inputs = {k: torch.tensor(v.tolist()) for k, v in prepared_inputs_dict.items()}
+
+                # load corresponding PyTorch class
+                pt_model_class_name = model_class.__name__[4:]  # Skip the "Flax" at the beginning
+                pt_model_class = getattr(transformers, pt_model_class_name)
+
+                pt_model = pt_model_class(config).eval()
+                fx_model = model_class(config, dtype=jnp.float32)
+
+                pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params)
+                batch_size, seq_length = pt_inputs["input_ids"].shape
+                rnd_start_indices = np.random.randint(0, seq_length - 1, size=(batch_size,))
+                for batch_idx, start_index in enumerate(rnd_start_indices):
+                    pt_inputs["attention_mask"][batch_idx, :start_index] = 0
+                    pt_inputs["attention_mask"][batch_idx, start_index:] = 1
+                    prepared_inputs_dict["attention_mask"][batch_idx, :start_index] = 0
+                    prepared_inputs_dict["attention_mask"][batch_idx, start_index:] = 1
+
+                # make sure weights are tied in PyTorch
+                pt_model.tie_weights()
+
+                with torch.no_grad():
+                    pt_outputs = pt_model(**pt_inputs).to_tuple()
+
+                fx_outputs = fx_model(**prepared_inputs_dict).to_tuple()
+                self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
+                for fx_output, pt_output in zip(fx_outputs, pt_outputs):
+                    self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2)
+
+                with tempfile.TemporaryDirectory() as tmpdirname:
+                    fx_model.save_pretrained(tmpdirname)
+                    pt_model_loaded = pt_model_class.from_pretrained(tmpdirname, from_flax=True)
+
+                with torch.no_grad():
+                    pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple()
+
+                self.assertEqual(
+                    len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch"
+                )
+                for fx_output, pt_output in zip(fx_outputs, pt_outputs_loaded):
+                    self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2)
+
+    @slow
+    def test_model_from_pretrained(self):
+        for model_class_name in self.all_model_classes:
+            model = model_class_name.from_pretrained("openlm-research/open_llama_3b_v2")
+            outputs = model(np.ones((1, 1)))
+            self.assertIsNotNone(outputs)

From 539d041ccdfc45dbbb1b777d51e994b7839e5409 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Sat, 5 Aug 2023 08:20:02 +0100
Subject: [PATCH 21/87] adds missing TYPE_CHECKING import and `make fixup`

---
 src/transformers/__init__.py                  |  1 +
 src/transformers/models/llama/__init__.py     |  7 ++--
 .../models/llama/modeling_flax_llama.py       |  4 ++-
 .../models/llama/test_modeling_flax_llama.py  | 35 +++++++++----------
 4 files changed, 23 insertions(+), 24 deletions(-)

diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index e41a70306b5ba5..0725d6d7d66279 100644
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -7705,6 +7705,7 @@
         from .models.gpt2 import FlaxGPT2LMHeadModel, FlaxGPT2Model, FlaxGPT2PreTrainedModel
         from .models.gpt_neo import FlaxGPTNeoForCausalLM, FlaxGPTNeoModel, FlaxGPTNeoPreTrainedModel
         from .models.gptj import FlaxGPTJForCausalLM, FlaxGPTJModel, FlaxGPTJPreTrainedModel
+        from .models.llama import FlaxLlamaForCausalLM, FlaxLlamaModel
         from .models.longt5 import FlaxLongT5ForConditionalGeneration, FlaxLongT5Model, FlaxLongT5PreTrainedModel
         from .models.marian import FlaxMarianModel, FlaxMarianMTModel, FlaxMarianPreTrainedModel
         from .models.mbart import (
diff --git a/src/transformers/models/llama/__init__.py b/src/transformers/models/llama/__init__.py
index c478c835473390..7fb601ebb7b74a 100644
--- a/src/transformers/models/llama/__init__.py
+++ b/src/transformers/models/llama/__init__.py
@@ -16,10 +16,10 @@
 from ...utils import (
     OptionalDependencyNotAvailable,
     _LazyModule,
+    is_flax_available,
     is_sentencepiece_available,
     is_tokenizers_available,
     is_torch_available,
-    is_flax_available
 )
 
 
@@ -62,10 +62,7 @@
 except OptionalDependencyNotAvailable:
     pass
 else:
-    _import_structure["modeling_flax_llama"] = [
-        "FlaxLlamaForCausalLM",
-        "FlaxLlamaModel"
-    ]
+    _import_structure["modeling_flax_llama"] = ["FlaxLlamaForCausalLM", "FlaxLlamaModel"]
 
 
 if TYPE_CHECKING:
diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index 07e78a095ea79f..1f46d7132f4791 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -711,7 +711,9 @@ def main():
     )
     # model = FlaxLlamaForCausalLM(config)
     N = 1
-    model = FlaxLlamaForCausalLM.from_pretrained("decapoda-research/llama-7b-hf", from_pt=True, dtype=jnp.float16, input_shape=(N, 128))
+    model = FlaxLlamaForCausalLM.from_pretrained(
+        "decapoda-research/llama-7b-hf", from_pt=True, dtype=jnp.float16, input_shape=(N, 128)
+    )
     print(config)
     exit()
 
diff --git a/tests/models/llama/test_modeling_flax_llama.py b/tests/models/llama/test_modeling_flax_llama.py
index f92c0c26b68e52..1f42d0a51a209f 100644
--- a/tests/models/llama/test_modeling_flax_llama.py
+++ b/tests/models/llama/test_modeling_flax_llama.py
@@ -19,7 +19,7 @@
 import numpy as np
 
 import transformers
-from transformers import LlamaTokenizer, LlamaConfig, is_flax_available, is_torch_available
+from transformers import LlamaConfig, is_flax_available, is_torch_available
 from transformers.testing_utils import is_pt_flax_cross_test, require_flax, slow
 
 from ...generation.test_flax_utils import FlaxGenerationTesterMixin
@@ -27,14 +27,13 @@
 
 
 if is_flax_available():
-    import jax
     import jax.numpy as jnp
 
     from transformers.modeling_flax_pytorch_utils import (
         convert_pytorch_state_dict_to_flax,
         load_flax_weights_in_pytorch_model,
     )
-    from transformers.models.llama.modeling_flax_llama import FlaxLlamaModel, FlaxLlamaForCausalLM
+    from transformers.models.llama.modeling_flax_llama import FlaxLlamaForCausalLM, FlaxLlamaModel
 
 if is_torch_available():
     import torch
@@ -203,27 +202,27 @@ def test_use_cache_forward_with_attn_mask(self):
 
     # @slow
     # def test_batch_generation(self):
-        # tokenizer = LlamaTokenizer.from_pretrained("llama", pad_token="<|endoftext|>", padding_side="left")
-        # inputs = tokenizer(["Hello this is a long string", "Hey"], return_tensors="np", padding=True, truncation=True)
+    # tokenizer = LlamaTokenizer.from_pretrained("llama", pad_token="<|endoftext|>", padding_side="left")
+    # inputs = tokenizer(["Hello this is a long string", "Hey"], return_tensors="np", padding=True, truncation=True)
 
-        # model = FlaxGPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-125M")
-        # model.do_sample = False
-        # model.config.pad_token_id = model.config.eos_token_id
+    # model = FlaxGPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-125M")
+    # model.do_sample = False
+    # model.config.pad_token_id = model.config.eos_token_id
 
-        # jit_generate = jax.jit(model.generate)
+    # jit_generate = jax.jit(model.generate)
 
-        # output_sequences = jit_generate(
-            # inputs["input_ids"], attention_mask=inputs["attention_mask"], pad_token_id=tokenizer.pad_token_id
-        # ).sequences
+    # output_sequences = jit_generate(
+    # inputs["input_ids"], attention_mask=inputs["attention_mask"], pad_token_id=tokenizer.pad_token_id
+    # ).sequences
 
-        # output_string = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
+    # output_string = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
 
-        # expected_string = [
-            # "Hello this is a long string of text.\n\nI'm trying to get the text of the",
-            # "Hey, I'm a little late to the party. I'm going to",
-        # ]
+    # expected_string = [
+    # "Hello this is a long string of text.\n\nI'm trying to get the text of the",
+    # "Hey, I'm a little late to the party. I'm going to",
+    # ]
 
-        # self.assertListEqual(output_string, expected_string)
+    # self.assertListEqual(output_string, expected_string)
 
     # overwrite from common since `attention_mask` in combination
     # with `causal_mask` behaves slighly differently

From c695d0aed4dc090208dced363c4687eb75a60f64 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Sat, 5 Aug 2023 08:41:42 +0100
Subject: [PATCH 22/87] adds back missing docstrings needs review on quality of
 docstrings, not sure what is required. Furthermore, need to check if
 `CHECKPOINT_FOR_DOC` is valid. See TODO

---
 src/transformers/models/llama/modeling_flax_llama.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index 1f46d7132f4791..90aa94a95057b6 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -27,7 +27,7 @@
 from jax import lax
 
 from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxCausalLMOutput
-from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel
+from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring
 from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
 from .configuration_llama import LlamaConfig
 
@@ -35,7 +35,7 @@
 logger = logging.get_logger(__name__)
 
 _CONFIG_FOR_DOC = "LlamaConfig"
-
+_CHECKPOINT_FOR_DOC = "meta-llama/Llama-2-7b" # TODO: is this an appropriate checkpoint?
 
 LLAMA_START_DOCSTRING = r"""
 
@@ -351,7 +351,6 @@ def __call__(
         return (hidden_states,) + outputs[1:]
 
 
-# TODO: check this is ported
 class FlaxLlamaPreTrainedModel(FlaxPreTrainedModel):
     """
     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
@@ -603,7 +602,7 @@ class FlaxLlamaModel(FlaxLlamaPreTrainedModel):
     module_class = FlaxLlamaModule
 
 
-# append_call_sample_docstring(FlaxLlamaModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutput, _CONFIG_FOR_DOC)
+append_call_sample_docstring(FlaxLlamaModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutput, _CONFIG_FOR_DOC)
 
 
 class FlaxLlamaForCausalLMModule(nn.Module):
@@ -687,7 +686,7 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
         return model_kwargs
 
 
-# append_call_sample_docstring(FlaxLlamaForCausalLM, _CHECKPOINT_FOR_DOC, FlaxCausalLMOutput, _CONFIG_FOR_DOC)
+append_call_sample_docstring(FlaxLlamaForCausalLM, _CHECKPOINT_FOR_DOC, FlaxCausalLMOutput, _CONFIG_FOR_DOC)
 
 
 def main():

From 1776e753f819639e5577855ec3918e11a2ea9959 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Sun, 6 Aug 2023 08:01:37 +0100
Subject: [PATCH 23/87] commenting out equivalence test as can just use common

---
 .../models/llama/test_modeling_flax_llama.py  | 192 +++++++++---------
 1 file changed, 96 insertions(+), 96 deletions(-)

diff --git a/tests/models/llama/test_modeling_flax_llama.py b/tests/models/llama/test_modeling_flax_llama.py
index 1f42d0a51a209f..77f39724242c70 100644
--- a/tests/models/llama/test_modeling_flax_llama.py
+++ b/tests/models/llama/test_modeling_flax_llama.py
@@ -226,102 +226,102 @@ def test_use_cache_forward_with_attn_mask(self):
 
     # overwrite from common since `attention_mask` in combination
     # with `causal_mask` behaves slighly differently
-    @is_pt_flax_cross_test
-    def test_equivalence_pt_to_flax(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                # prepare inputs
-                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                pt_inputs = {k: torch.tensor(v.tolist()) for k, v in prepared_inputs_dict.items()}
-
-                # load corresponding PyTorch class
-                pt_model_class_name = model_class.__name__[4:]  # Skip the "Flax" at the beginning
-                pt_model_class = getattr(transformers, pt_model_class_name)
-
-                batch_size, seq_length = pt_inputs["input_ids"].shape
-                rnd_start_indices = np.random.randint(0, seq_length - 1, size=(batch_size,))
-                for batch_idx, start_index in enumerate(rnd_start_indices):
-                    pt_inputs["attention_mask"][batch_idx, :start_index] = 0
-                    pt_inputs["attention_mask"][batch_idx, start_index:] = 1
-                    prepared_inputs_dict["attention_mask"][batch_idx, :start_index] = 0
-                    prepared_inputs_dict["attention_mask"][batch_idx, start_index:] = 1
-                pt_model = pt_model_class(config).eval()
-                fx_model = model_class(config, dtype=jnp.float32)
-
-                fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model)
-                fx_model.params = fx_state
-
-                with torch.no_grad():
-                    pt_outputs = pt_model(**pt_inputs).to_tuple()
-
-                fx_outputs = fx_model(**prepared_inputs_dict).to_tuple()
-                self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-                for fx_output, pt_output in zip(fx_outputs, pt_outputs):
-                    self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2)
-
-                with tempfile.TemporaryDirectory() as tmpdirname:
-                    pt_model.save_pretrained(tmpdirname)
-                    fx_model_loaded = model_class.from_pretrained(tmpdirname, from_pt=True)
-
-                fx_outputs_loaded = fx_model_loaded(**prepared_inputs_dict).to_tuple()
-                self.assertEqual(
-                    len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch"
-                )
-                for fx_output_loaded, pt_output in zip(fx_outputs_loaded, pt_outputs):
-                    self.assert_almost_equals(fx_output_loaded[:, -1], pt_output[:, -1].numpy(), 4e-2)
-
-    # overwrite from common since `attention_mask` in combination
-    # with `causal_mask` behaves slighly differently
-    @is_pt_flax_cross_test
-    def test_equivalence_flax_to_pt(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                # prepare inputs
-                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                pt_inputs = {k: torch.tensor(v.tolist()) for k, v in prepared_inputs_dict.items()}
-
-                # load corresponding PyTorch class
-                pt_model_class_name = model_class.__name__[4:]  # Skip the "Flax" at the beginning
-                pt_model_class = getattr(transformers, pt_model_class_name)
-
-                pt_model = pt_model_class(config).eval()
-                fx_model = model_class(config, dtype=jnp.float32)
-
-                pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params)
-                batch_size, seq_length = pt_inputs["input_ids"].shape
-                rnd_start_indices = np.random.randint(0, seq_length - 1, size=(batch_size,))
-                for batch_idx, start_index in enumerate(rnd_start_indices):
-                    pt_inputs["attention_mask"][batch_idx, :start_index] = 0
-                    pt_inputs["attention_mask"][batch_idx, start_index:] = 1
-                    prepared_inputs_dict["attention_mask"][batch_idx, :start_index] = 0
-                    prepared_inputs_dict["attention_mask"][batch_idx, start_index:] = 1
-
-                # make sure weights are tied in PyTorch
-                pt_model.tie_weights()
-
-                with torch.no_grad():
-                    pt_outputs = pt_model(**pt_inputs).to_tuple()
-
-                fx_outputs = fx_model(**prepared_inputs_dict).to_tuple()
-                self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-                for fx_output, pt_output in zip(fx_outputs, pt_outputs):
-                    self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2)
-
-                with tempfile.TemporaryDirectory() as tmpdirname:
-                    fx_model.save_pretrained(tmpdirname)
-                    pt_model_loaded = pt_model_class.from_pretrained(tmpdirname, from_flax=True)
-
-                with torch.no_grad():
-                    pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple()
-
-                self.assertEqual(
-                    len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch"
-                )
-                for fx_output, pt_output in zip(fx_outputs, pt_outputs_loaded):
-                    self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2)
+    # @is_pt_flax_cross_test
+    # def test_equivalence_pt_to_flax(self):
+        # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        # for model_class in self.all_model_classes:
+            # with self.subTest(model_class.__name__):
+                # # prepare inputs
+                # prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
+                # pt_inputs = {k: torch.tensor(v.tolist()) for k, v in prepared_inputs_dict.items()}
+
+                # # load corresponding PyTorch class
+                # pt_model_class_name = model_class.__name__[4:]  # Skip the "Flax" at the beginning
+                # pt_model_class = getattr(transformers, pt_model_class_name)
+
+                # batch_size, seq_length = pt_inputs["input_ids"].shape
+                # rnd_start_indices = np.random.randint(0, seq_length - 1, size=(batch_size,))
+                # for batch_idx, start_index in enumerate(rnd_start_indices):
+                    # pt_inputs["attention_mask"][batch_idx, :start_index] = 0
+                    # pt_inputs["attention_mask"][batch_idx, start_index:] = 1
+                    # prepared_inputs_dict["attention_mask"][batch_idx, :start_index] = 0
+                    # prepared_inputs_dict["attention_mask"][batch_idx, start_index:] = 1
+                # pt_model = pt_model_class(config).eval()
+                # fx_model = model_class(config, dtype=jnp.float32)
+
+                # fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model)
+                # fx_model.params = fx_state
+
+                # with torch.no_grad():
+                    # pt_outputs = pt_model(**pt_inputs).to_tuple()
+
+                # fx_outputs = fx_model(**prepared_inputs_dict).to_tuple()
+                # self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
+                # for fx_output, pt_output in zip(fx_outputs, pt_outputs):
+                    # self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2)
+
+                # with tempfile.TemporaryDirectory() as tmpdirname:
+                    # pt_model.save_pretrained(tmpdirname)
+                    # fx_model_loaded = model_class.from_pretrained(tmpdirname, from_pt=True)
+
+                # fx_outputs_loaded = fx_model_loaded(**prepared_inputs_dict).to_tuple()
+                # self.assertEqual(
+                    # len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch"
+                # )
+                # for fx_output_loaded, pt_output in zip(fx_outputs_loaded, pt_outputs):
+                    # self.assert_almost_equals(fx_output_loaded[:, -1], pt_output[:, -1].numpy(), 4e-2)
+
+    # # overwrite from common since `attention_mask` in combination
+    # # with `causal_mask` behaves slighly differently
+    # @is_pt_flax_cross_test
+    # def test_equivalence_flax_to_pt(self):
+        # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        # for model_class in self.all_model_classes:
+            # with self.subTest(model_class.__name__):
+                # # prepare inputs
+                # prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
+                # pt_inputs = {k: torch.tensor(v.tolist()) for k, v in prepared_inputs_dict.items()}
+
+                # # load corresponding PyTorch class
+                # pt_model_class_name = model_class.__name__[4:]  # Skip the "Flax" at the beginning
+                # pt_model_class = getattr(transformers, pt_model_class_name)
+
+                # pt_model = pt_model_class(config).eval()
+                # fx_model = model_class(config, dtype=jnp.float32)
+
+                # pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params)
+                # batch_size, seq_length = pt_inputs["input_ids"].shape
+                # rnd_start_indices = np.random.randint(0, seq_length - 1, size=(batch_size,))
+                # for batch_idx, start_index in enumerate(rnd_start_indices):
+                    # pt_inputs["attention_mask"][batch_idx, :start_index] = 0
+                    # pt_inputs["attention_mask"][batch_idx, start_index:] = 1
+                    # prepared_inputs_dict["attention_mask"][batch_idx, :start_index] = 0
+                    # prepared_inputs_dict["attention_mask"][batch_idx, start_index:] = 1
+
+                # # make sure weights are tied in PyTorch
+                # pt_model.tie_weights()
+
+                # with torch.no_grad():
+                    # pt_outputs = pt_model(**pt_inputs).to_tuple()
+
+                # fx_outputs = fx_model(**prepared_inputs_dict).to_tuple()
+                # self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
+                # for fx_output, pt_output in zip(fx_outputs, pt_outputs):
+                    # self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2)
+
+                # with tempfile.TemporaryDirectory() as tmpdirname:
+                    # fx_model.save_pretrained(tmpdirname)
+                    # pt_model_loaded = pt_model_class.from_pretrained(tmpdirname, from_flax=True)
+
+                # with torch.no_grad():
+                    # pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple()
+
+                # self.assertEqual(
+                    # len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch"
+                # )
+                # for fx_output, pt_output in zip(fx_outputs, pt_outputs_loaded):
+                    # self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2)
 
     @slow
     def test_model_from_pretrained(self):

From 3b4f55a2d6c94aa4989811cbb6a06ea383a88e1e Mon Sep 17 00:00:00 2001
From: Alex McKinney <alexmc@graphcore.ai>
Date: Thu, 10 Aug 2023 08:32:02 +0100
Subject: [PATCH 24/87] debugging

---
 .../models/llama/modeling_flax_llama.py       | 23 +++++++++----------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index 90aa94a95057b6..214435fd8b6a16 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -701,26 +701,25 @@ def main():
     torch.manual_seed(0)
 
     config = LlamaConfig(
-        num_hidden_layers=16,
-        vocab_size=64,
-        hidden_size=64,
-        num_attention_heads=8,
+        num_hidden_layers=4,
+        vocab_size=16,
+        hidden_size=16,
+        num_attention_heads=2,
         max_position_embeddings=128,
-        intermediate_size=256,
+        intermediate_size=64,
     )
-    # model = FlaxLlamaForCausalLM(config)
     N = 1
-    model = FlaxLlamaForCausalLM.from_pretrained(
-        "decapoda-research/llama-7b-hf", from_pt=True, dtype=jnp.float16, input_shape=(N, 128)
-    )
-    print(config)
-    exit()
+    # model = FlaxLlamaForCausalLM.from_pretrained(
+    #     "decapoda-research/llama-7b-hf", from_pt=True, dtype=jnp.float16, input_shape=(N, 128)
+    # )
+    # print(config)
+    # exit()
 
     model = FlaxLlamaForCausalLM(config)
     pt_model = LlamaForCausalLM(config)
 
     key, subkey = jax.random.split(key)
-    x = jax.random.randint(subkey, (N, 128), 0, 64)
+    x = jax.random.randint(subkey, (N, 128), 0, 16)
     mask = jnp.ones((N, 128), dtype=bool)
     position_ids = jnp.arange(128)[jnp.newaxis, :].repeat(N, axis=0)
 

From 3fa9f2ed4fa5043f28c4f175625eae64374eb890 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alexmc@graphcore.ai>
Date: Thu, 10 Aug 2023 09:00:30 +0100
Subject: [PATCH 25/87] =?UTF-8?q?Fixes=20bug=20where=20mask=20and=20pos=5F?=
 =?UTF-8?q?ids=20were=20swapped=20in=20pretrained=20models=20This=20result?=
 =?UTF-8?q?s=20in=20all=20tests=20passing=20now=20=F0=9F=94=A5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../models/llama/modeling_flax_llama.py       |   6 +-
 .../models/llama/test_modeling_flax_llama.py  | 190 +++++++++---------
 2 files changed, 95 insertions(+), 101 deletions(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index 214435fd8b6a16..8b5ec0417f1b54 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -35,7 +35,7 @@
 logger = logging.get_logger(__name__)
 
 _CONFIG_FOR_DOC = "LlamaConfig"
-_CHECKPOINT_FOR_DOC = "meta-llama/Llama-2-7b" # TODO: is this an appropriate checkpoint?
+_CHECKPOINT_FOR_DOC = "meta-llama/Llama-2-7b"  # TODO: is this an appropriate checkpoint?
 
 LLAMA_START_DOCSTRING = r"""
 
@@ -332,8 +332,8 @@ def __call__(
         hidden_states = self.input_layernorm(hidden_states)
         outputs = self.self_attn(
             hidden_states,
-            position_ids=position_ids,
             attention_mask=attention_mask,
+            position_ids=position_ids,
             deterministic=deterministic,
             init_cache=init_cache,
             output_attentions=output_attentions,
@@ -467,8 +467,8 @@ def __call__(
         outputs = self.module.apply(
             inputs,
             jnp.array(input_ids, dtype="i4"),
-            jnp.array(position_ids, dtype="i4"),
             jnp.array(attention_mask, dtype="i4"),
+            jnp.array(position_ids, dtype="i4"),
             not train,
             False,
             output_attentions,
diff --git a/tests/models/llama/test_modeling_flax_llama.py b/tests/models/llama/test_modeling_flax_llama.py
index 77f39724242c70..2da9a1ee93cc71 100644
--- a/tests/models/llama/test_modeling_flax_llama.py
+++ b/tests/models/llama/test_modeling_flax_llama.py
@@ -13,14 +13,12 @@
 # limitations under the License.
 
 
-import tempfile
 import unittest
 
 import numpy as np
 
-import transformers
 from transformers import LlamaConfig, is_flax_available, is_torch_available
-from transformers.testing_utils import is_pt_flax_cross_test, require_flax, slow
+from transformers.testing_utils import require_flax, slow
 
 from ...generation.test_flax_utils import FlaxGenerationTesterMixin
 from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor, random_attention_mask
@@ -29,14 +27,10 @@
 if is_flax_available():
     import jax.numpy as jnp
 
-    from transformers.modeling_flax_pytorch_utils import (
-        convert_pytorch_state_dict_to_flax,
-        load_flax_weights_in_pytorch_model,
-    )
     from transformers.models.llama.modeling_flax_llama import FlaxLlamaForCausalLM, FlaxLlamaModel
 
 if is_torch_available():
-    import torch
+    pass
 
 
 class FlaxLlamaModelTester:
@@ -228,104 +222,104 @@ def test_use_cache_forward_with_attn_mask(self):
     # with `causal_mask` behaves slighly differently
     # @is_pt_flax_cross_test
     # def test_equivalence_pt_to_flax(self):
-        # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        # for model_class in self.all_model_classes:
-            # with self.subTest(model_class.__name__):
-                # # prepare inputs
-                # prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                # pt_inputs = {k: torch.tensor(v.tolist()) for k, v in prepared_inputs_dict.items()}
-
-                # # load corresponding PyTorch class
-                # pt_model_class_name = model_class.__name__[4:]  # Skip the "Flax" at the beginning
-                # pt_model_class = getattr(transformers, pt_model_class_name)
-
-                # batch_size, seq_length = pt_inputs["input_ids"].shape
-                # rnd_start_indices = np.random.randint(0, seq_length - 1, size=(batch_size,))
-                # for batch_idx, start_index in enumerate(rnd_start_indices):
-                    # pt_inputs["attention_mask"][batch_idx, :start_index] = 0
-                    # pt_inputs["attention_mask"][batch_idx, start_index:] = 1
-                    # prepared_inputs_dict["attention_mask"][batch_idx, :start_index] = 0
-                    # prepared_inputs_dict["attention_mask"][batch_idx, start_index:] = 1
-                # pt_model = pt_model_class(config).eval()
-                # fx_model = model_class(config, dtype=jnp.float32)
-
-                # fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model)
-                # fx_model.params = fx_state
-
-                # with torch.no_grad():
-                    # pt_outputs = pt_model(**pt_inputs).to_tuple()
-
-                # fx_outputs = fx_model(**prepared_inputs_dict).to_tuple()
-                # self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-                # for fx_output, pt_output in zip(fx_outputs, pt_outputs):
-                    # self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2)
-
-                # with tempfile.TemporaryDirectory() as tmpdirname:
-                    # pt_model.save_pretrained(tmpdirname)
-                    # fx_model_loaded = model_class.from_pretrained(tmpdirname, from_pt=True)
-
-                # fx_outputs_loaded = fx_model_loaded(**prepared_inputs_dict).to_tuple()
-                # self.assertEqual(
-                    # len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch"
-                # )
-                # for fx_output_loaded, pt_output in zip(fx_outputs_loaded, pt_outputs):
-                    # self.assert_almost_equals(fx_output_loaded[:, -1], pt_output[:, -1].numpy(), 4e-2)
+    # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+    # for model_class in self.all_model_classes:
+    # with self.subTest(model_class.__name__):
+    # # prepare inputs
+    # prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
+    # pt_inputs = {k: torch.tensor(v.tolist()) for k, v in prepared_inputs_dict.items()}
+
+    # # load corresponding PyTorch class
+    # pt_model_class_name = model_class.__name__[4:]  # Skip the "Flax" at the beginning
+    # pt_model_class = getattr(transformers, pt_model_class_name)
+
+    # batch_size, seq_length = pt_inputs["input_ids"].shape
+    # rnd_start_indices = np.random.randint(0, seq_length - 1, size=(batch_size,))
+    # for batch_idx, start_index in enumerate(rnd_start_indices):
+    # pt_inputs["attention_mask"][batch_idx, :start_index] = 0
+    # pt_inputs["attention_mask"][batch_idx, start_index:] = 1
+    # prepared_inputs_dict["attention_mask"][batch_idx, :start_index] = 0
+    # prepared_inputs_dict["attention_mask"][batch_idx, start_index:] = 1
+    # pt_model = pt_model_class(config).eval()
+    # fx_model = model_class(config, dtype=jnp.float32)
+
+    # fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model)
+    # fx_model.params = fx_state
+
+    # with torch.no_grad():
+    # pt_outputs = pt_model(**pt_inputs).to_tuple()
+
+    # fx_outputs = fx_model(**prepared_inputs_dict).to_tuple()
+    # self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
+    # for fx_output, pt_output in zip(fx_outputs, pt_outputs):
+    # self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2)
+
+    # with tempfile.TemporaryDirectory() as tmpdirname:
+    # pt_model.save_pretrained(tmpdirname)
+    # fx_model_loaded = model_class.from_pretrained(tmpdirname, from_pt=True)
+
+    # fx_outputs_loaded = fx_model_loaded(**prepared_inputs_dict).to_tuple()
+    # self.assertEqual(
+    # len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch"
+    # )
+    # for fx_output_loaded, pt_output in zip(fx_outputs_loaded, pt_outputs):
+    # self.assert_almost_equals(fx_output_loaded[:, -1], pt_output[:, -1].numpy(), 4e-2)
 
     # # overwrite from common since `attention_mask` in combination
     # # with `causal_mask` behaves slighly differently
     # @is_pt_flax_cross_test
     # def test_equivalence_flax_to_pt(self):
-        # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        # for model_class in self.all_model_classes:
-            # with self.subTest(model_class.__name__):
-                # # prepare inputs
-                # prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                # pt_inputs = {k: torch.tensor(v.tolist()) for k, v in prepared_inputs_dict.items()}
-
-                # # load corresponding PyTorch class
-                # pt_model_class_name = model_class.__name__[4:]  # Skip the "Flax" at the beginning
-                # pt_model_class = getattr(transformers, pt_model_class_name)
-
-                # pt_model = pt_model_class(config).eval()
-                # fx_model = model_class(config, dtype=jnp.float32)
-
-                # pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params)
-                # batch_size, seq_length = pt_inputs["input_ids"].shape
-                # rnd_start_indices = np.random.randint(0, seq_length - 1, size=(batch_size,))
-                # for batch_idx, start_index in enumerate(rnd_start_indices):
-                    # pt_inputs["attention_mask"][batch_idx, :start_index] = 0
-                    # pt_inputs["attention_mask"][batch_idx, start_index:] = 1
-                    # prepared_inputs_dict["attention_mask"][batch_idx, :start_index] = 0
-                    # prepared_inputs_dict["attention_mask"][batch_idx, start_index:] = 1
-
-                # # make sure weights are tied in PyTorch
-                # pt_model.tie_weights()
-
-                # with torch.no_grad():
-                    # pt_outputs = pt_model(**pt_inputs).to_tuple()
-
-                # fx_outputs = fx_model(**prepared_inputs_dict).to_tuple()
-                # self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-                # for fx_output, pt_output in zip(fx_outputs, pt_outputs):
-                    # self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2)
-
-                # with tempfile.TemporaryDirectory() as tmpdirname:
-                    # fx_model.save_pretrained(tmpdirname)
-                    # pt_model_loaded = pt_model_class.from_pretrained(tmpdirname, from_flax=True)
-
-                # with torch.no_grad():
-                    # pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple()
-
-                # self.assertEqual(
-                    # len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch"
-                # )
-                # for fx_output, pt_output in zip(fx_outputs, pt_outputs_loaded):
-                    # self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2)
+    # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+    # for model_class in self.all_model_classes:
+    # with self.subTest(model_class.__name__):
+    # # prepare inputs
+    # prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
+    # pt_inputs = {k: torch.tensor(v.tolist()) for k, v in prepared_inputs_dict.items()}
+
+    # # load corresponding PyTorch class
+    # pt_model_class_name = model_class.__name__[4:]  # Skip the "Flax" at the beginning
+    # pt_model_class = getattr(transformers, pt_model_class_name)
+
+    # pt_model = pt_model_class(config).eval()
+    # fx_model = model_class(config, dtype=jnp.float32)
+
+    # pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params)
+    # batch_size, seq_length = pt_inputs["input_ids"].shape
+    # rnd_start_indices = np.random.randint(0, seq_length - 1, size=(batch_size,))
+    # for batch_idx, start_index in enumerate(rnd_start_indices):
+    # pt_inputs["attention_mask"][batch_idx, :start_index] = 0
+    # pt_inputs["attention_mask"][batch_idx, start_index:] = 1
+    # prepared_inputs_dict["attention_mask"][batch_idx, :start_index] = 0
+    # prepared_inputs_dict["attention_mask"][batch_idx, start_index:] = 1
+
+    # # make sure weights are tied in PyTorch
+    # pt_model.tie_weights()
+
+    # with torch.no_grad():
+    # pt_outputs = pt_model(**pt_inputs).to_tuple()
+
+    # fx_outputs = fx_model(**prepared_inputs_dict).to_tuple()
+    # self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
+    # for fx_output, pt_output in zip(fx_outputs, pt_outputs):
+    # self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2)
+
+    # with tempfile.TemporaryDirectory() as tmpdirname:
+    # fx_model.save_pretrained(tmpdirname)
+    # pt_model_loaded = pt_model_class.from_pretrained(tmpdirname, from_flax=True)
+
+    # with torch.no_grad():
+    # pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple()
+
+    # self.assertEqual(
+    # len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch"
+    # )
+    # for fx_output, pt_output in zip(fx_outputs, pt_outputs_loaded):
+    # self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2)
 
     @slow
     def test_model_from_pretrained(self):
         for model_class_name in self.all_model_classes:
-            model = model_class_name.from_pretrained("openlm-research/open_llama_3b_v2")
+            model = model_class_name.from_pretrained("openlm-research/open_llama_3b_v2", from_pt=True)
             outputs = model(np.ones((1, 1)))
             self.assertIsNotNone(outputs)

From 9d4bdadbe3f309388f789cab35c91f9521827b95 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Fri, 11 Aug 2023 07:01:50 +0100
Subject: [PATCH 26/87] cleanup of modeling file

---
 .../models/llama/modeling_flax_llama.py       | 138 +++---------------
 1 file changed, 23 insertions(+), 115 deletions(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index 8b5ec0417f1b54..3e5c6e38d5b9f8 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -1,5 +1,10 @@
 # coding=utf-8
-# Copyright 2021 The Eleuther AI and The Google Flax Team Authors and The HuggingFace Inc. team.
+# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,6 +18,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# TODO: please verif the above license
+
 from functools import partial
 from typing import Optional, Tuple
 
@@ -35,7 +42,7 @@
 logger = logging.get_logger(__name__)
 
 _CONFIG_FOR_DOC = "LlamaConfig"
-_CHECKPOINT_FOR_DOC = "meta-llama/Llama-2-7b"  # TODO: is this an appropriate checkpoint?
+_CHECKPOINT_FOR_DOC = "openlm-research/open_llama_3b_v2" # TODO: is this checkpoint appropriate?
 
 LLAMA_START_DOCSTRING = r"""
 
@@ -58,6 +65,18 @@
         config ([`LlamaConfig`]): Model configuration class with all the parameters of the model.
             Initializing with a config file does not load the weights associated with the model, only the
             configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
+        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16`, or
+            `jax.numpy.bfloat16`.
+
+            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
+            specified all the computation will be performed with the given `dtype`.
+
+            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
+            parameters.**
+
+            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
+            [`~FlaxPreTrainedModel.to_bf16`].
 """
 
 LLAMA_INPUTS_DOCSTRING = r"""
@@ -98,13 +117,6 @@
         past_key_values (`Dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
             Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
             auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*.
-        inputs_embeds (`np.array` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
-            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
-            model's internal embedding lookup matrix.
-        use_cache (`bool`, *optional*):
-            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
-            `past_key_values`).
         output_attentions (`bool`, *optional*):
             Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
             tensors for more detail.
@@ -464,6 +476,7 @@ def __call__(
         else:
             mutable = False
 
+        # TODO: can this handle input tensors being passed as kwargs? I copied GPT-Neo directly here
         outputs = self.module.apply(
             inputs,
             jnp.array(input_ids, dtype="i4"),
@@ -651,8 +664,7 @@ def __call__(
 
 @add_start_docstrings(
     """
-    The Llama Model transformer with a language modeling head on top (linear layer with weights tied to the input
-    embeddings).
+    The Llama Model transformer with a language modeling head (linear layer) on top.
     """,
     LLAMA_START_DOCSTRING,
 )
@@ -687,107 +699,3 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
 
 
 append_call_sample_docstring(FlaxLlamaForCausalLM, _CHECKPOINT_FOR_DOC, FlaxCausalLMOutput, _CONFIG_FOR_DOC)
-
-
-def main():
-    # jax.config.update("jax_platform_name", "cpu")
-    torch.set_printoptions(precision=8)
-    jnp.set_printoptions(precision=8, floatmode="fixed")
-
-    from .configuration_llama import LlamaConfig
-    from .modeling_llama import LlamaForCausalLM
-
-    key = jax.random.PRNGKey(0)
-    torch.manual_seed(0)
-
-    config = LlamaConfig(
-        num_hidden_layers=4,
-        vocab_size=16,
-        hidden_size=16,
-        num_attention_heads=2,
-        max_position_embeddings=128,
-        intermediate_size=64,
-    )
-    N = 1
-    # model = FlaxLlamaForCausalLM.from_pretrained(
-    #     "decapoda-research/llama-7b-hf", from_pt=True, dtype=jnp.float16, input_shape=(N, 128)
-    # )
-    # print(config)
-    # exit()
-
-    model = FlaxLlamaForCausalLM(config)
-    pt_model = LlamaForCausalLM(config)
-
-    key, subkey = jax.random.split(key)
-    x = jax.random.randint(subkey, (N, 128), 0, 16)
-    mask = jnp.ones((N, 128), dtype=bool)
-    position_ids = jnp.arange(128)[jnp.newaxis, :].repeat(N, axis=0)
-
-    key, model_key = jax.random.split(key)
-    params = model.params
-    y = model(x, attention_mask=mask, position_ids=position_ids)
-    y = y[0]
-
-    params = flatten_dict(params, sep=".")
-
-    for i, l in enumerate(pt_model.model.layers):
-        pt_state = l.state_dict()
-        l.load_state_dict(
-            {
-                "self_attn.q_proj.weight": torch.from_numpy(
-                    np.asarray(params[f"model.layers.{i}.self_attn.q_proj.kernel"])
-                ).T,
-                "self_attn.k_proj.weight": torch.from_numpy(
-                    np.asarray(params[f"model.layers.{i}.self_attn.k_proj.kernel"])
-                ).T,
-                "self_attn.v_proj.weight": torch.from_numpy(
-                    np.asarray(params[f"model.layers.{i}.self_attn.v_proj.kernel"])
-                ).T,
-                "self_attn.o_proj.weight": torch.from_numpy(
-                    np.asarray(params[f"model.layers.{i}.self_attn.o_proj.kernel"])
-                ).T,
-                "self_attn.rotary_emb.inv_freq": pt_state["self_attn.rotary_emb.inv_freq"],
-                "input_layernorm.weight": torch.from_numpy(
-                    np.asarray(params[f"model.layers.{i}.input_layernorm.weight"])
-                ),
-                "post_attention_layernorm.weight": torch.from_numpy(
-                    np.asarray(params[f"model.layers.{i}.post_attention_layernorm.weight"])
-                ),
-                "mlp.down_proj.weight": torch.from_numpy(
-                    np.asarray(params[f"model.layers.{i}.mlp.down_proj.kernel"])
-                ).T,
-                "mlp.up_proj.weight": torch.from_numpy(np.asarray(params[f"model.layers.{i}.mlp.up_proj.kernel"])).T,
-                "mlp.gate_proj.weight": torch.from_numpy(
-                    np.asarray(params[f"model.layers.{i}.mlp.gate_proj.kernel"])
-                ).T,
-            }
-        )
-
-    pt_model.model.embed_tokens.weight.copy_(torch.from_numpy(np.asarray(params["model.embed_tokens.embedding"])))
-    pt_model.model.norm.weight.copy_(torch.from_numpy(np.asarray(params["model.norm.weight"])))
-    pt_model.lm_head.weight.copy_(torch.from_numpy(np.asarray(params["lm_head.kernel"].T)))
-
-    x_pt = torch.tensor(np.asarray(x))
-    pt_y = pt_model(
-        x_pt,
-        attention_mask=torch.from_numpy(np.asarray(mask)),
-        position_ids=torch.from_numpy(np.asarray(position_ids)),
-    )[0]
-
-    pt_y = pt_y.detach().numpy()
-
-    try:
-        np.testing.assert_allclose(y, pt_y, atol=1e-5, rtol=1e-5)
-    except AssertionError as e:
-        print(e)
-        import ipdb
-
-        ipdb.set_trace()
-
-    print("done")
-
-
-if __name__ == "__main__":
-    import torch
-
-    torch.no_grad()(main)()

From ffb5e472fda17a0083c7548bf34d780a8fc58804 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Fri, 11 Aug 2023 07:02:24 +0100
Subject: [PATCH 27/87] cleanup of test file

---
 .../models/llama/test_modeling_flax_llama.py  | 123 ------------------
 1 file changed, 123 deletions(-)

diff --git a/tests/models/llama/test_modeling_flax_llama.py b/tests/models/llama/test_modeling_flax_llama.py
index 2da9a1ee93cc71..6102d077c06222 100644
--- a/tests/models/llama/test_modeling_flax_llama.py
+++ b/tests/models/llama/test_modeling_flax_llama.py
@@ -194,129 +194,6 @@ def test_use_cache_forward_with_attn_mask(self):
                 model_class_name, config, input_ids, attention_mask
             )
 
-    # @slow
-    # def test_batch_generation(self):
-    # tokenizer = LlamaTokenizer.from_pretrained("llama", pad_token="<|endoftext|>", padding_side="left")
-    # inputs = tokenizer(["Hello this is a long string", "Hey"], return_tensors="np", padding=True, truncation=True)
-
-    # model = FlaxGPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-125M")
-    # model.do_sample = False
-    # model.config.pad_token_id = model.config.eos_token_id
-
-    # jit_generate = jax.jit(model.generate)
-
-    # output_sequences = jit_generate(
-    # inputs["input_ids"], attention_mask=inputs["attention_mask"], pad_token_id=tokenizer.pad_token_id
-    # ).sequences
-
-    # output_string = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
-
-    # expected_string = [
-    # "Hello this is a long string of text.\n\nI'm trying to get the text of the",
-    # "Hey, I'm a little late to the party. I'm going to",
-    # ]
-
-    # self.assertListEqual(output_string, expected_string)
-
-    # overwrite from common since `attention_mask` in combination
-    # with `causal_mask` behaves slighly differently
-    # @is_pt_flax_cross_test
-    # def test_equivalence_pt_to_flax(self):
-    # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-    # for model_class in self.all_model_classes:
-    # with self.subTest(model_class.__name__):
-    # # prepare inputs
-    # prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-    # pt_inputs = {k: torch.tensor(v.tolist()) for k, v in prepared_inputs_dict.items()}
-
-    # # load corresponding PyTorch class
-    # pt_model_class_name = model_class.__name__[4:]  # Skip the "Flax" at the beginning
-    # pt_model_class = getattr(transformers, pt_model_class_name)
-
-    # batch_size, seq_length = pt_inputs["input_ids"].shape
-    # rnd_start_indices = np.random.randint(0, seq_length - 1, size=(batch_size,))
-    # for batch_idx, start_index in enumerate(rnd_start_indices):
-    # pt_inputs["attention_mask"][batch_idx, :start_index] = 0
-    # pt_inputs["attention_mask"][batch_idx, start_index:] = 1
-    # prepared_inputs_dict["attention_mask"][batch_idx, :start_index] = 0
-    # prepared_inputs_dict["attention_mask"][batch_idx, start_index:] = 1
-    # pt_model = pt_model_class(config).eval()
-    # fx_model = model_class(config, dtype=jnp.float32)
-
-    # fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model)
-    # fx_model.params = fx_state
-
-    # with torch.no_grad():
-    # pt_outputs = pt_model(**pt_inputs).to_tuple()
-
-    # fx_outputs = fx_model(**prepared_inputs_dict).to_tuple()
-    # self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-    # for fx_output, pt_output in zip(fx_outputs, pt_outputs):
-    # self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2)
-
-    # with tempfile.TemporaryDirectory() as tmpdirname:
-    # pt_model.save_pretrained(tmpdirname)
-    # fx_model_loaded = model_class.from_pretrained(tmpdirname, from_pt=True)
-
-    # fx_outputs_loaded = fx_model_loaded(**prepared_inputs_dict).to_tuple()
-    # self.assertEqual(
-    # len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch"
-    # )
-    # for fx_output_loaded, pt_output in zip(fx_outputs_loaded, pt_outputs):
-    # self.assert_almost_equals(fx_output_loaded[:, -1], pt_output[:, -1].numpy(), 4e-2)
-
-    # # overwrite from common since `attention_mask` in combination
-    # # with `causal_mask` behaves slighly differently
-    # @is_pt_flax_cross_test
-    # def test_equivalence_flax_to_pt(self):
-    # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-    # for model_class in self.all_model_classes:
-    # with self.subTest(model_class.__name__):
-    # # prepare inputs
-    # prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-    # pt_inputs = {k: torch.tensor(v.tolist()) for k, v in prepared_inputs_dict.items()}
-
-    # # load corresponding PyTorch class
-    # pt_model_class_name = model_class.__name__[4:]  # Skip the "Flax" at the beginning
-    # pt_model_class = getattr(transformers, pt_model_class_name)
-
-    # pt_model = pt_model_class(config).eval()
-    # fx_model = model_class(config, dtype=jnp.float32)
-
-    # pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params)
-    # batch_size, seq_length = pt_inputs["input_ids"].shape
-    # rnd_start_indices = np.random.randint(0, seq_length - 1, size=(batch_size,))
-    # for batch_idx, start_index in enumerate(rnd_start_indices):
-    # pt_inputs["attention_mask"][batch_idx, :start_index] = 0
-    # pt_inputs["attention_mask"][batch_idx, start_index:] = 1
-    # prepared_inputs_dict["attention_mask"][batch_idx, :start_index] = 0
-    # prepared_inputs_dict["attention_mask"][batch_idx, start_index:] = 1
-
-    # # make sure weights are tied in PyTorch
-    # pt_model.tie_weights()
-
-    # with torch.no_grad():
-    # pt_outputs = pt_model(**pt_inputs).to_tuple()
-
-    # fx_outputs = fx_model(**prepared_inputs_dict).to_tuple()
-    # self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-    # for fx_output, pt_output in zip(fx_outputs, pt_outputs):
-    # self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2)
-
-    # with tempfile.TemporaryDirectory() as tmpdirname:
-    # fx_model.save_pretrained(tmpdirname)
-    # pt_model_loaded = pt_model_class.from_pretrained(tmpdirname, from_flax=True)
-
-    # with torch.no_grad():
-    # pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple()
-
-    # self.assertEqual(
-    # len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch"
-    # )
-    # for fx_output, pt_output in zip(fx_outputs, pt_outputs_loaded):
-    # self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2)
-
     @slow
     def test_model_from_pretrained(self):
         for model_class_name in self.all_model_classes:

From 020bd4e6052cfdd76b06dc57895019be396f2f45 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Thu, 17 Aug 2023 21:24:59 +0100
Subject: [PATCH 28/87] Resolving simpler review comments

---
 .../models/llama/modeling_flax_llama.py       | 38 ++++++++++---------
 .../models/llama/test_modeling_flax_llama.py  | 10 ++---
 2 files changed, 25 insertions(+), 23 deletions(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index 3e5c6e38d5b9f8..14a7763eb90bd1 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -18,8 +18,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# TODO: please verif the above license
-
 from functools import partial
 from typing import Optional, Tuple
 
@@ -42,7 +40,7 @@
 logger = logging.get_logger(__name__)
 
 _CONFIG_FOR_DOC = "LlamaConfig"
-_CHECKPOINT_FOR_DOC = "openlm-research/open_llama_3b_v2" # TODO: is this checkpoint appropriate?
+_CHECKPOINT_FOR_DOC = "openlm-research/open_llama_3b_v2"
 
 LLAMA_START_DOCSTRING = r"""
 
@@ -149,21 +147,23 @@ def apply_rotary_pos_emb(tensor, sincos):
 
 
 class FlaxLlamaRMSNorm(nn.Module):
-    eps: float = 1e-6
+    config: LlamaConfig
+    dtype: jnp.dtype = jnp.float32
+    def setup(self):
+        self.epsilon = self.config.rms_norm_eps
+        self.weight = self.param("weight", lambda _, shape: jnp.ones(shape), self.config.hidden_size)
 
     @nn.compact
     def __call__(self, hidden_states):
         input_dtype = hidden_states.dtype
-        variance = jnp.asarray(hidden_states, dtype=jnp.float32)
+        variance = jnp.asarray(hidden_states, dtype=self.dtype)
         variance = jnp.power(variance, 2)
         variance = variance.mean(-1, keepdims=True)
         # use `jax.numpy.sqrt` as `jax.lax.rsqrt` does not match `torch.rsqrt`
-        # hidden_states = hidden_states * jax.lax.rsqrt(variance + self.eps)
-        hidden_states = hidden_states / jnp.sqrt(variance + self.eps)
-
-        weight = self.param("weight", lambda _, shape: jnp.ones(shape), hidden_states.shape[-1])
+        # hidden_states = hidden_states * jax.lax.rsqrt(variance + self.epsilon)
+        hidden_states = hidden_states / jnp.sqrt(variance + self.epsilon)
 
-        return jnp.asarray(weight * hidden_states, dtype=input_dtype)
+        return jnp.asarray(self.weight * hidden_states, dtype=input_dtype)
 
 
 class FlaxLlamaAttention(nn.Module):
@@ -198,6 +198,7 @@ def _split_heads(self, hidden_states):
     def _merge_heads(self, hidden_states):
         return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))
 
+    # Copied from transformers.models.gpt_neo.modeling_flax_gpt_neo.FlaxGPTNeoSelfAttention._concatenate_to_cache
     @nn.compact
     def _concatenate_to_cache(self, key, value, query, attention_mask):
         """
@@ -242,6 +243,7 @@ def __call__(
         query = self.q_proj(hidden_states)
         key = self.k_proj(hidden_states)
         value = self.v_proj(hidden_states)
+
         query = self._split_heads(query)
         key = self._split_heads(key)
         value = self._split_heads(value)
@@ -300,17 +302,18 @@ def __call__(
 
 class FlaxLlamaMLP(nn.Module):
     config: LlamaConfig
-    intermediate_size: int
     dtype: jnp.dtype = jnp.float32
 
     def setup(self):
         embed_dim = self.config.hidden_size
+        inner_dim = self.config.intermediate_size if self.config.intermediate_size is not None else 4 * hidden_size
+
         jax.nn.initializers.normal(self.config.initializer_range)
         self.act = ACT2FN[self.config.hidden_act]
 
-        self.gate_proj = nn.Dense(self.intermediate_size, use_bias=False, dtype=self.dtype)
+        self.gate_proj = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype)
         self.down_proj = nn.Dense(embed_dim, use_bias=False, dtype=self.dtype)
-        self.up_proj = nn.Dense(self.intermediate_size, use_bias=False, dtype=self.dtype)
+        self.up_proj = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype)
 
     def __call__(self, hidden_states):
         hidden_states = self.up_proj(hidden_states) * self.act(self.gate_proj(hidden_states))
@@ -324,12 +327,11 @@ class FlaxLlamaDecoderLayer(nn.Module):
 
     def setup(self):
         hidden_size = self.config.hidden_size
-        inner_dim = self.config.intermediate_size if self.config.intermediate_size is not None else 4 * hidden_size
 
-        self.input_layernorm = FlaxLlamaRMSNorm(eps=self.config.rms_norm_eps)
+        self.input_layernorm = FlaxLlamaRMSNorm(self.config, dtype=self.dtype)
         self.self_attn = FlaxLlamaAttention(self.config, dtype=self.dtype)
-        self.post_attention_layernorm = FlaxLlamaRMSNorm(eps=self.config.rms_norm_eps)
-        self.mlp = FlaxLlamaMLP(self.config, inner_dim, dtype=self.dtype)
+        self.post_attention_layernorm = FlaxLlamaRMSNorm(self.config, dtype=self.dtype)
+        self.mlp = FlaxLlamaMLP(self.config, dtype=self.dtype)
 
     def __call__(
         self,
@@ -476,7 +478,6 @@ def __call__(
         else:
             mutable = False
 
-        # TODO: can this handle input tensors being passed as kwargs? I copied GPT-Neo directly here
         outputs = self.module.apply(
             inputs,
             jnp.array(input_ids, dtype="i4"),
@@ -560,6 +561,7 @@ def setup(self):
             self.config.vocab_size,
             self.hidden_size,
             embedding_init=embedding_init,
+            dtype=self.dtype,
         )
         self.layers = FlaxLlamaBlockCollection(self.config, dtype=self.dtype)
         self.norm = FlaxLlamaRMSNorm(self.config.rms_norm_eps)
diff --git a/tests/models/llama/test_modeling_flax_llama.py b/tests/models/llama/test_modeling_flax_llama.py
index 6102d077c06222..1108a7a072d1b8 100644
--- a/tests/models/llama/test_modeling_flax_llama.py
+++ b/tests/models/llama/test_modeling_flax_llama.py
@@ -37,18 +37,18 @@ class FlaxLlamaModelTester:
     def __init__(
         self,
         parent,
-        batch_size=14,
+        batch_size=2,
         seq_length=7,
         is_training=True,
         use_input_mask=True,
         use_token_type_ids=False,
         use_labels=True,
         vocab_size=99,
-        hidden_size=32,
-        num_hidden_layers=4,
-        num_attention_heads=4,
+        hidden_size=16,
+        num_hidden_layers=2,
+        num_attention_heads=2,
         attention_types=[[["global", "local"], 2]],
-        intermediate_size=37,
+        intermediate_size=64,
         hidden_act="gelu",
         hidden_dropout_prob=0.1,
         attention_probs_dropout_prob=0.1,

From e9e391fd405c07558cbc44efb14c759b14319af0 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alexmc@graphcore.ai>
Date: Wed, 23 Aug 2023 07:35:47 +0100
Subject: [PATCH 29/87] addresses more minor review comments

---
 .../models/llama/modeling_flax_llama.py       | 20 +++++++++----------
 .../models/llama/test_modeling_flax_llama.py  |  2 +-
 2 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index 14a7763eb90bd1..cfc60a16925f02 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -148,22 +148,21 @@ def apply_rotary_pos_emb(tensor, sincos):
 
 class FlaxLlamaRMSNorm(nn.Module):
     config: LlamaConfig
-    dtype: jnp.dtype = jnp.float32
+
     def setup(self):
         self.epsilon = self.config.rms_norm_eps
         self.weight = self.param("weight", lambda _, shape: jnp.ones(shape), self.config.hidden_size)
 
-    @nn.compact
     def __call__(self, hidden_states):
         input_dtype = hidden_states.dtype
-        variance = jnp.asarray(hidden_states, dtype=self.dtype)
+        variance = jnp.asarray(hidden_states, dtype=jnp.float32)
         variance = jnp.power(variance, 2)
         variance = variance.mean(-1, keepdims=True)
         # use `jax.numpy.sqrt` as `jax.lax.rsqrt` does not match `torch.rsqrt`
         # hidden_states = hidden_states * jax.lax.rsqrt(variance + self.epsilon)
         hidden_states = hidden_states / jnp.sqrt(variance + self.epsilon)
 
-        return jnp.asarray(self.weight * hidden_states, dtype=input_dtype)
+        return self.weight * jnp.asarray(hidden_states, dtype=input_dtype)
 
 
 class FlaxLlamaAttention(nn.Module):
@@ -306,7 +305,7 @@ class FlaxLlamaMLP(nn.Module):
 
     def setup(self):
         embed_dim = self.config.hidden_size
-        inner_dim = self.config.intermediate_size if self.config.intermediate_size is not None else 4 * hidden_size
+        inner_dim = self.config.intermediate_size if self.config.intermediate_size is not None else 4 * embed_dim
 
         jax.nn.initializers.normal(self.config.initializer_range)
         self.act = ACT2FN[self.config.hidden_act]
@@ -326,11 +325,9 @@ class FlaxLlamaDecoderLayer(nn.Module):
     dtype: jnp.dtype = jnp.float32
 
     def setup(self):
-        hidden_size = self.config.hidden_size
-
-        self.input_layernorm = FlaxLlamaRMSNorm(self.config, dtype=self.dtype)
+        self.input_layernorm = FlaxLlamaRMSNorm(self.config)
         self.self_attn = FlaxLlamaAttention(self.config, dtype=self.dtype)
-        self.post_attention_layernorm = FlaxLlamaRMSNorm(self.config, dtype=self.dtype)
+        self.post_attention_layernorm = FlaxLlamaRMSNorm(self.config)
         self.mlp = FlaxLlamaMLP(self.config, dtype=self.dtype)
 
     def __call__(
@@ -365,6 +362,7 @@ def __call__(
         return (hidden_states,) + outputs[1:]
 
 
+# Copied from transformers.models.gpt_neo.modeling_flax_gpt_neo.FlaxGPTNeoPreTrainedModel with GPTNeo->Llama
 class FlaxLlamaPreTrainedModel(FlaxPreTrainedModel):
     """
     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
@@ -504,7 +502,7 @@ def __call__(
         return outputs
 
 
-class FlaxLlamaBlockCollection(nn.Module):
+class FlaxLlamaLayerCollection(nn.Module):
     config: LlamaConfig
     dtype: jnp.dtype = jnp.float32
 
@@ -563,7 +561,7 @@ def setup(self):
             embedding_init=embedding_init,
             dtype=self.dtype,
         )
-        self.layers = FlaxLlamaBlockCollection(self.config, dtype=self.dtype)
+        self.layers = FlaxLlamaLayerCollection(self.config, dtype=self.dtype)
         self.norm = FlaxLlamaRMSNorm(self.config.rms_norm_eps)
 
     def __call__(
diff --git a/tests/models/llama/test_modeling_flax_llama.py b/tests/models/llama/test_modeling_flax_llama.py
index 1108a7a072d1b8..d7a41a5d88863d 100644
--- a/tests/models/llama/test_modeling_flax_llama.py
+++ b/tests/models/llama/test_modeling_flax_llama.py
@@ -176,7 +176,7 @@ def check_use_cache_forward_with_attn_mask(self, model_class_name, config, input
 
 @require_flax
 class FlaxLlamaModelTest(FlaxModelTesterMixin, FlaxGenerationTesterMixin, unittest.TestCase):
-    all_model_classes = (FlaxLlamaModel,) if is_flax_available() else ()
+    all_model_classes = (FlaxLlamaModel, FlaxLlamaForCausalLM) if is_flax_available() else ()
     all_generative_model_classes = (FlaxLlamaForCausalLM,) if is_flax_available() else ()
 
     def setUp(self):

From ff0818ff74dd4a95e5bec0e85282bf3f7eb9e771 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alexmc@graphcore.ai>
Date: Wed, 23 Aug 2023 08:14:51 +0100
Subject: [PATCH 30/87] fixing introduced pytest errors from review

---
 src/transformers/models/llama/modeling_flax_llama.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index cfc60a16925f02..c6f88bfb917f94 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -370,7 +370,7 @@ class FlaxLlamaPreTrainedModel(FlaxPreTrainedModel):
     """
 
     config_class = LlamaConfig
-    base_model_prefix = "transformer"
+    base_model_prefix = "model"
     module_class: nn.Module = None
 
     def __init__(
@@ -562,7 +562,7 @@ def setup(self):
             dtype=self.dtype,
         )
         self.layers = FlaxLlamaLayerCollection(self.config, dtype=self.dtype)
-        self.norm = FlaxLlamaRMSNorm(self.config.rms_norm_eps)
+        self.norm = FlaxLlamaRMSNorm(self.config)
 
     def __call__(
         self,

From d18daad08cf63b32ad0211b915dd1b77d9668a29 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alexmc@graphcore.ai>
Date: Wed, 23 Aug 2023 14:55:20 +0100
Subject: [PATCH 31/87] wip additional slow tests

---
 .../models/llama/test_modeling_flax_llama.py  | 25 +++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/tests/models/llama/test_modeling_flax_llama.py b/tests/models/llama/test_modeling_flax_llama.py
index d7a41a5d88863d..5d2c0aa4ff3171 100644
--- a/tests/models/llama/test_modeling_flax_llama.py
+++ b/tests/models/llama/test_modeling_flax_llama.py
@@ -200,3 +200,28 @@ def test_model_from_pretrained(self):
             model = model_class_name.from_pretrained("openlm-research/open_llama_3b_v2", from_pt=True)
             outputs = model(np.ones((1, 1)))
             self.assertIsNotNone(outputs)
+
+    @slow
+    def test_model_logits(self):
+        model_id = "openlm-research/open_llama_3b_v2"
+        model = FlaxLlamaForCausalLM.from_pretrained(model_id, from_pt=True)
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        test_batch = [
+            "Aloha, World! ",
+            "2 + 2 = ",
+            "Paris is the capital of ",
+            "我很高興認識"
+        ]
+
+        tokenized_batch = tokenizer(test_batch, padding='max_length', max_length=model.config.max_position_embeddings)
+
+        # TODO: add expected logits here
+        # fmt: off
+        EXPECTED_LOGITS = None
+        # fmt: on
+
+        self.assertTrue(np.allclose())
+
+    @slow
+    def test_generated_text(self):
+        model = FlaxLlamaForCausalLM.from_pretrained("openlm-research/open_llama_3b_f2", from_pt=True)
\ No newline at end of file

From 230abebfbdbdc46d059f611394f3ff7f58a079d8 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alexmc@graphcore.ai>
Date: Thu, 24 Aug 2023 09:06:22 +0100
Subject: [PATCH 32/87] wip tests need to grab a GPU machine to get real logits
 for comparison otherwise, slow tests should be okay

---
 .../models/llama/test_modeling_flax_llama.py  | 56 +++++++++++++++----
 1 file changed, 44 insertions(+), 12 deletions(-)

diff --git a/tests/models/llama/test_modeling_flax_llama.py b/tests/models/llama/test_modeling_flax_llama.py
index 5d2c0aa4ff3171..ae398235ee93b8 100644
--- a/tests/models/llama/test_modeling_flax_llama.py
+++ b/tests/models/llama/test_modeling_flax_llama.py
@@ -18,7 +18,7 @@
 import numpy as np
 
 from transformers import LlamaConfig, is_flax_available, is_torch_available
-from transformers.testing_utils import require_flax, slow
+from transformers.testing_utils import require_flax, slow, skip
 
 from ...generation.test_flax_utils import FlaxGenerationTesterMixin
 from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor, random_attention_mask
@@ -28,11 +28,11 @@
     import jax.numpy as jnp
 
     from transformers.models.llama.modeling_flax_llama import FlaxLlamaForCausalLM, FlaxLlamaModel
+    from transformers import LlamaTokenizerFast
 
 if is_torch_available():
     pass
 
-
 class FlaxLlamaModelTester:
     def __init__(
         self,
@@ -202,26 +202,58 @@ def test_model_from_pretrained(self):
             self.assertIsNotNone(outputs)
 
     @slow
+    @skip # wip test
     def test_model_logits(self):
         model_id = "openlm-research/open_llama_3b_v2"
         model = FlaxLlamaForCausalLM.from_pretrained(model_id, from_pt=True)
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
-        test_batch = [
-            "Aloha, World! ",
-            "2 + 2 = ",
-            "Paris is the capital of ",
-            "我很高興認識"
-        ]
+        test_batch = jnp.arange(32).reshape(4, 8) + 0x777
 
-        tokenized_batch = tokenizer(test_batch, padding='max_length', max_length=model.config.max_position_embeddings)
+        flax_logits = model(test_batch).logits
 
         # TODO: add expected logits here
         # fmt: off
         EXPECTED_LOGITS = None
         # fmt: on
 
-        self.assertTrue(np.allclose())
+        self.assertAlmostEqual(flax_logits, EXPECTED_LOGITS, places=4)
 
     @slow
+    @skip # wip test
+    def test_model_hidden_states(self):
+        model_id = "openlm-research/open_llama_3b_v2"
+        model = FlaxLlamaForCausalLM.from_pretrained(model_id, from_pt=True)
+        test_batch = jnp.arange(32).reshape(4, 8) + 0x777
+
+        flax_hidden_states = model(test_batch).hidden_states
+        # TODO: calculate mean of all hidden states
+        flax_hidden_means = None
+
+        # TODO: add expected logits here
+        # fmt: off
+        EXPECTED_HIDDEN_MEANS = None
+        # fmt: on
+
+        self.assertAlmostEqual(flax_hidden_means, EXPECTED_HIDDEN_MEANS, places=4)
+
+    @slow
+    @skip # wip test
     def test_generated_text(self):
-        model = FlaxLlamaForCausalLM.from_pretrained("openlm-research/open_llama_3b_f2", from_pt=True)
\ No newline at end of file
+        model_id = "openlm-research/open_llama_3b_v2"
+        model = FlaxLlamaForCausalLM.from_pretrained(model_id, from_pt=True)
+
+        tokenizer = LlamaTokenizerFast.from_pretrained(model_id)
+        test_batch = [
+            "Aloha, World! ",
+            "2 + 2 = ",
+            "Paris is the capital of ",
+            "我很高興認識"
+        ] 
+
+        inputs = tokenizer(test_batch, return_tensors='np', padding=True, truncation=True)
+        generated_ids = model.generate(**inputs, max_length=20).sequences
+        generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
+
+        # TODO: add expected outputs
+        EXPECTED_GENERATION = None
+
+        self.assertListEqual(generated_text, EXPECTED_GENERATION)
\ No newline at end of file

From b19213d88da5cac8520f6ecc153e6d849780bde4 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alexmc@graphcore.ai>
Date: Thu, 24 Aug 2023 09:23:24 +0100
Subject: [PATCH 33/87] `make quality`, `make style`

---
 .../models/llama/test_modeling_flax_llama.py  | 26 ++++++++-----------
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/tests/models/llama/test_modeling_flax_llama.py b/tests/models/llama/test_modeling_flax_llama.py
index ae398235ee93b8..63b3ecf61be112 100644
--- a/tests/models/llama/test_modeling_flax_llama.py
+++ b/tests/models/llama/test_modeling_flax_llama.py
@@ -18,7 +18,7 @@
 import numpy as np
 
 from transformers import LlamaConfig, is_flax_available, is_torch_available
-from transformers.testing_utils import require_flax, slow, skip
+from transformers.testing_utils import require_flax, skip, slow
 
 from ...generation.test_flax_utils import FlaxGenerationTesterMixin
 from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor, random_attention_mask
@@ -27,12 +27,13 @@
 if is_flax_available():
     import jax.numpy as jnp
 
-    from transformers.models.llama.modeling_flax_llama import FlaxLlamaForCausalLM, FlaxLlamaModel
     from transformers import LlamaTokenizerFast
+    from transformers.models.llama.modeling_flax_llama import FlaxLlamaForCausalLM, FlaxLlamaModel
 
 if is_torch_available():
     pass
 
+
 class FlaxLlamaModelTester:
     def __init__(
         self,
@@ -202,7 +203,7 @@ def test_model_from_pretrained(self):
             self.assertIsNotNone(outputs)
 
     @slow
-    @skip # wip test
+    @skip  # wip test
     def test_model_logits(self):
         model_id = "openlm-research/open_llama_3b_v2"
         model = FlaxLlamaForCausalLM.from_pretrained(model_id, from_pt=True)
@@ -218,13 +219,13 @@ def test_model_logits(self):
         self.assertAlmostEqual(flax_logits, EXPECTED_LOGITS, places=4)
 
     @slow
-    @skip # wip test
+    @skip  # wip test
     def test_model_hidden_states(self):
         model_id = "openlm-research/open_llama_3b_v2"
         model = FlaxLlamaForCausalLM.from_pretrained(model_id, from_pt=True)
         test_batch = jnp.arange(32).reshape(4, 8) + 0x777
 
-        flax_hidden_states = model(test_batch).hidden_states
+        model(test_batch).hidden_states
         # TODO: calculate mean of all hidden states
         flax_hidden_means = None
 
@@ -236,24 +237,19 @@ def test_model_hidden_states(self):
         self.assertAlmostEqual(flax_hidden_means, EXPECTED_HIDDEN_MEANS, places=4)
 
     @slow
-    @skip # wip test
+    @skip  # wip test
     def test_generated_text(self):
         model_id = "openlm-research/open_llama_3b_v2"
         model = FlaxLlamaForCausalLM.from_pretrained(model_id, from_pt=True)
 
         tokenizer = LlamaTokenizerFast.from_pretrained(model_id)
-        test_batch = [
-            "Aloha, World! ",
-            "2 + 2 = ",
-            "Paris is the capital of ",
-            "我很高興認識"
-        ] 
-
-        inputs = tokenizer(test_batch, return_tensors='np', padding=True, truncation=True)
+        test_batch = ["Aloha, World! ", "2 + 2 = ", "Paris is the capital of ", "我很高興認識"]
+
+        inputs = tokenizer(test_batch, return_tensors="np", padding=True, truncation=True)
         generated_ids = model.generate(**inputs, max_length=20).sequences
         generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
 
         # TODO: add expected outputs
         EXPECTED_GENERATION = None
 
-        self.assertListEqual(generated_text, EXPECTED_GENERATION)
\ No newline at end of file
+        self.assertListEqual(generated_text, EXPECTED_GENERATION)

From 2959abdf2d77699bd6c7e1f1c28d05311dafd867 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Thu, 24 Aug 2023 23:43:01 +0100
Subject: [PATCH 34/87] adds slow integration tests - checking logits -
 checking hidden states - checking generation outputs

---
 .../models/llama/test_modeling_flax_llama.py  | 44 ++++++++++++-------
 1 file changed, 28 insertions(+), 16 deletions(-)

diff --git a/tests/models/llama/test_modeling_flax_llama.py b/tests/models/llama/test_modeling_flax_llama.py
index 63b3ecf61be112..78ae952f78d1c5 100644
--- a/tests/models/llama/test_modeling_flax_llama.py
+++ b/tests/models/llama/test_modeling_flax_llama.py
@@ -18,7 +18,7 @@
 import numpy as np
 
 from transformers import LlamaConfig, is_flax_available, is_torch_available
-from transformers.testing_utils import require_flax, skip, slow
+from transformers.testing_utils import require_flax, slow
 
 from ...generation.test_flax_utils import FlaxGenerationTesterMixin
 from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor, random_attention_mask
@@ -203,7 +203,6 @@ def test_model_from_pretrained(self):
             self.assertIsNotNone(outputs)
 
     @slow
-    @skip  # wip test
     def test_model_logits(self):
         model_id = "openlm-research/open_llama_3b_v2"
         model = FlaxLlamaForCausalLM.from_pretrained(model_id, from_pt=True)
@@ -211,45 +210,58 @@ def test_model_logits(self):
 
         flax_logits = model(test_batch).logits
 
-        # TODO: add expected logits here
         # fmt: off
-        EXPECTED_LOGITS = None
+        EXPECTED_LOGITS = [-74.4243, -74.0680, -65.2507, -79.1658, -77.7460, -69.2379, -86.4588, -84.8933, -77.8456]
+        EXPECTED_MIN, EXPECTED_MAX, EXPECTED_MEAN = -96.9952, -18.4571, -65.0608
         # fmt: on
 
-        self.assertAlmostEqual(flax_logits, EXPECTED_LOGITS, places=4)
+        self.assertTrue(np.allclose(flax_logits[0, :3, :3].flatten(), EXPECTED_LOGITS, atol=1e-4))
+        self.assertAlmostEqual(flax_logits.min(), EXPECTED_MIN, places=3)
+        self.assertAlmostEqual(flax_logits.max(), EXPECTED_MAX, places=3)
+        self.assertAlmostEqual(flax_logits.mean(), EXPECTED_MEAN, places=3)
 
     @slow
-    @skip  # wip test
     def test_model_hidden_states(self):
         model_id = "openlm-research/open_llama_3b_v2"
         model = FlaxLlamaForCausalLM.from_pretrained(model_id, from_pt=True)
         test_batch = jnp.arange(32).reshape(4, 8) + 0x777
 
-        model(test_batch).hidden_states
-        # TODO: calculate mean of all hidden states
-        flax_hidden_means = None
+        flax_hidden_states = model(test_batch, output_hidden_states=True).hidden_states
+        flax_hidden_means = [h.mean() for h in flax_hidden_states]
 
-        # TODO: add expected logits here
         # fmt: off
-        EXPECTED_HIDDEN_MEANS = None
+        EXPECTED_HIDDEN_MEANS = [
+            -0.00007,-0.00049,-0.00169,-0.00253,-0.00271,
+            -0.00290,-0.00252,0.00230,0.00230,0.00198,
+            0.00196,0.00174,0.00246,0.00205,0.00242,
+            0.00171,0.00092,0.00054,0.00102,0.00024,
+            0.00029,0.00037,-0.00101,-0.00062,-0.00341,-0.00636,-0.00357
+        ]
         # fmt: on
 
-        self.assertAlmostEqual(flax_hidden_means, EXPECTED_HIDDEN_MEANS, places=4)
+        self.assertTrue(np.allclose(flax_hidden_means, EXPECTED_HIDDEN_MEANS, atol=1e-4))
 
     @slow
-    @skip  # wip test
     def test_generated_text(self):
         model_id = "openlm-research/open_llama_3b_v2"
         model = FlaxLlamaForCausalLM.from_pretrained(model_id, from_pt=True)
 
         tokenizer = LlamaTokenizerFast.from_pretrained(model_id)
+        tokenizer.pad_token_id = 2
         test_batch = ["Aloha, World! ", "2 + 2 = ", "Paris is the capital of ", "我很高興認識"]
 
-        inputs = tokenizer(test_batch, return_tensors="np", padding=True, truncation=True)
-        generated_ids = model.generate(**inputs, max_length=20).sequences
+        inputs = tokenizer(test_batch, return_tensors="np", truncation=True, padding=True)
+        generated_ids = model.generate(**inputs, max_length=15).sequences
         generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
 
         # TODO: add expected outputs
-        EXPECTED_GENERATION = None
+        # fmt: off
+        EXPECTED_GENERATION = [
+            "Aloha, World! 201",
+            "2 + 2 = 4\n2",
+            "Paris is the capital of Île-",
+            "我很高興認識你，我"
+        ]
+        # fmt: on
 
         self.assertListEqual(generated_text, EXPECTED_GENERATION)

From a5b587b7c424f0b392bcb26769f43391420eabe7 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Thu, 24 Aug 2023 22:51:55 +0100
Subject: [PATCH 35/87] `make fix-copies`

---
 .../models/llama/modeling_flax_llama.py        | 18 +++++-------------
 src/transformers/utils/dummy_flax_objects.py   | 14 ++++++++++++++
 2 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index c6f88bfb917f94..f6e1fbc919c9c1 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -199,7 +199,6 @@ def _merge_heads(self, hidden_states):
 
     # Copied from transformers.models.gpt_neo.modeling_flax_gpt_neo.FlaxGPTNeoSelfAttention._concatenate_to_cache
     @nn.compact
-    def _concatenate_to_cache(self, key, value, query, attention_mask):
         """
         This function takes projected key, value states from a single input token and concatenates the states to cached
         states from previous steps. This function is slighly adapted from the official Flax repository:
@@ -370,7 +369,7 @@ class FlaxLlamaPreTrainedModel(FlaxPreTrainedModel):
     """
 
     config_class = LlamaConfig
-    base_model_prefix = "model"
+    base_model_prefix = "transformer"
     module_class: nn.Module = None
 
     def __init__(
@@ -393,9 +392,7 @@ def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: Froz
         params_rng, dropout_rng = jax.random.split(rng)
         rngs = {"params": params_rng, "dropout": dropout_rng}
 
-        random_params = self.module.init(
-            rngs, input_ids, position_ids=position_ids, attention_mask=attention_mask, return_dict=False
-        )["params"]
+        random_params = self.module.init(rngs, input_ids, attention_mask, position_ids, return_dict=False)["params"]
 
         if params is not None:
             random_params = flatten_dict(unfreeze(random_params))
@@ -422,16 +419,11 @@ def init_cache(self, batch_size, max_length):
         position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
 
         init_variables = self.module.init(
-            jax.random.PRNGKey(0),
-            input_ids,
-            position_ids=position_ids,
-            attention_mask=attention_mask,
-            return_dict=False,
-            init_cache=True,
+            jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True
         )
         return unfreeze(init_variables["cache"])
 
-    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(GPT_NEO_INPUTS_DOCSTRING)
     def __call__(
         self,
         input_ids,
@@ -469,7 +461,7 @@ def __call__(
 
         inputs = {"params": params or self.params}
 
-        # if past_key_values are passed then cache is already initialized a private flag init_cache has to be passed down to ensure cache is used. It has to be made sure that cache is marked as mutable so that it can be changed by FlaxLlamaNeoAttention module
+        # if past_key_values are passed then cache is already initialized a private flag init_cache has to be passed down to ensure cache is used. It has to be made sure that cache is marked as mutable so that it can be changed by FlaxLlamaAttention module
         if past_key_values:
             inputs["cache"] = past_key_values
             mutable = ["cache"]
diff --git a/src/transformers/utils/dummy_flax_objects.py b/src/transformers/utils/dummy_flax_objects.py
index 4090e4ff5134e1..670333fb85004e 100644
--- a/src/transformers/utils/dummy_flax_objects.py
+++ b/src/transformers/utils/dummy_flax_objects.py
@@ -800,6 +800,20 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["flax"])
 
 
+class FlaxLlamaForCausalLM(metaclass=DummyObject):
+    _backends = ["flax"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["flax"])
+
+
+class FlaxLlamaModel(metaclass=DummyObject):
+    _backends = ["flax"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["flax"])
+
+
 class FlaxLongT5ForConditionalGeneration(metaclass=DummyObject):
     _backends = ["flax"]
 

From 852e5e35fb13b37bda35c2721aecbb61334350f1 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Thu, 24 Aug 2023 23:01:33 +0100
Subject: [PATCH 36/87] fix mangled function following `make fix-copies`

---
 src/transformers/models/llama/modeling_flax_llama.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index f6e1fbc919c9c1..05193a93454bad 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -197,8 +197,9 @@ def _split_heads(self, hidden_states):
     def _merge_heads(self, hidden_states):
         return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))
 
-    # Copied from transformers.models.gpt_neo.modeling_flax_gpt_neo.FlaxGPTNeoSelfAttention._concatenate_to_cache
     @nn.compact
+    # Copied from transformers.models.gpt_neo.modeling_flax_gpt_neo.FlaxGPTNeoSelfAttention._concatenate_to_cache
+    def _concatenate_to_cache(self, key, value, query, attention_mask):
         """
         This function takes projected key, value states from a single input token and concatenates the states to cached
         states from previous steps. This function is slighly adapted from the official Flax repository:

From fd85d5a3683d8e5a7bb5c582ad12a93d8bffb940 Mon Sep 17 00:00:00 2001
From: Alex McKinney <44398246+vvvm23@users.noreply.github.com>
Date: Fri, 25 Aug 2023 09:23:24 +0100
Subject: [PATCH 37/87] adds missing type checking imports

---
 src/transformers/models/llama/__init__.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/transformers/models/llama/__init__.py b/src/transformers/models/llama/__init__.py
index 7fb601ebb7b74a..23e6bf8ab2d34b 100644
--- a/src/transformers/models/llama/__init__.py
+++ b/src/transformers/models/llama/__init__.py
@@ -92,6 +92,14 @@
     else:
         from .modeling_llama import LlamaForCausalLM, LlamaForSequenceClassification, LlamaModel, LlamaPreTrainedModel
 
+    try:
+        if not is_flax_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .modeling_flax_llama import FlaxLlamaForCausalLM, FlaxLlamaModel
+
 
 else:
     import sys

From fe5aed22dadd52125afeb9d0fbfb9159c0d6b009 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alexmc@graphcore.ai>
Date: Wed, 30 Aug 2023 17:02:19 +0100
Subject: [PATCH 38/87] fixes missing parameter checkpoint warning

---
 .../models/llama/modeling_flax_llama.py       | 34 ++++++++++++++-----
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index 05193a93454bad..3034d62615654b 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -165,6 +165,28 @@ def __call__(self, hidden_states):
         return self.weight * jnp.asarray(hidden_states, dtype=input_dtype)
 
 
+class FlaxLlamaRotaryEmbedding(nn.Module):
+    config: LlamaConfig
+
+    def setup(self):
+        head_dim = self.config.hidden_size // self.config.num_attention_heads
+        self.sincos = create_sinusoidal_positions(self.config.max_position_embeddings, head_dim)
+
+        # inv_freq is unused but create it here to avoid mismatch when loading
+        # from pt checkpoint.
+        inv_freq = 1.0 / (10000 ** (jnp.arange(0, head_dim, 2) / head_dim))
+        self.inv_freq = self.param("inv_freq", lambda _: inv_freq)
+
+    def __call__(self, key, query, position_ids):
+        sincos = self.sincos[position_ids]
+        sincos = jnp.split(sincos, 2, axis=-1)
+
+        key = apply_rotary_pos_emb(key, sincos)
+        query = apply_rotary_pos_emb(query, sincos)
+
+        return key, query
+
+
 class FlaxLlamaAttention(nn.Module):
     config: LlamaConfig
     dtype: jnp.dtype = jnp.float32
@@ -189,7 +211,7 @@ def setup(self):
         self.o_proj = dense()
 
         self.causal_mask = make_causal_mask(jnp.ones((1, config.max_position_embeddings), dtype="bool"), dtype="bool")
-        self.embed_positions = create_sinusoidal_positions(config.max_position_embeddings, self.head_dim)
+        self.rotary_emb = FlaxLlamaRotaryEmbedding(config)
 
     def _split_heads(self, hidden_states):
         return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim))
@@ -247,11 +269,7 @@ def __call__(
         key = self._split_heads(key)
         value = self._split_heads(value)
 
-        sincos = self.embed_positions[position_ids]
-        sincos = jnp.split(sincos, 2, axis=-1)
-
-        key = apply_rotary_pos_emb(key, sincos)
-        query = apply_rotary_pos_emb(query, sincos)
+        key, query = self.rotary_emb(key, query, position_ids)
 
         query_length, key_length = query.shape[1], key.shape[1]
 
@@ -370,7 +388,7 @@ class FlaxLlamaPreTrainedModel(FlaxPreTrainedModel):
     """
 
     config_class = LlamaConfig
-    base_model_prefix = "transformer"
+    base_model_prefix = "model"
     module_class: nn.Module = None
 
     def __init__(
@@ -424,7 +442,7 @@ def init_cache(self, batch_size, max_length):
         )
         return unfreeze(init_variables["cache"])
 
-    @add_start_docstrings_to_model_forward(GPT_NEO_INPUTS_DOCSTRING)
+    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
     def __call__(
         self,
         input_ids,

From 57b47c6aab2c178237c1050a2b79043786b5bd1e Mon Sep 17 00:00:00 2001
From: Alex McKinney <alexmc@graphcore.ai>
Date: Thu, 31 Aug 2023 10:46:43 +0100
Subject: [PATCH 39/87] more finegrained 'Copied from' tags avoids issue of
 overwriting `LLAMA_INPUTS_DOCSTRING`

---
 src/transformers/models/llama/modeling_flax_llama.py | 5 ++++-
 tests/models/llama/test_modeling_flax_llama.py       | 1 -
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index 3034d62615654b..2e409e2b056bdc 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -380,7 +380,6 @@ def __call__(
         return (hidden_states,) + outputs[1:]
 
 
-# Copied from transformers.models.gpt_neo.modeling_flax_gpt_neo.FlaxGPTNeoPreTrainedModel with GPTNeo->Llama
 class FlaxLlamaPreTrainedModel(FlaxPreTrainedModel):
     """
     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
@@ -391,6 +390,7 @@ class FlaxLlamaPreTrainedModel(FlaxPreTrainedModel):
     base_model_prefix = "model"
     module_class: nn.Module = None
 
+    # Copied from transformers.models.gpt_neo.modeling_flax_gpt_neo.FlaxGPTNeoPreTrainedModel.__init__ with GPTNeo->Llama
     def __init__(
         self,
         config: LlamaConfig,
@@ -403,6 +403,7 @@ def __init__(
         module = self.module_class(config=config, dtype=dtype, **kwargs)
         super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
 
+    # Copied from transformers.models.gpt_neo.modeling_flax_gpt_neo.FlaxGPTNeoPreTrainedModel.init_weights with GPTNeo->Llama
     def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
         # init input tensors
         input_ids = jnp.zeros(input_shape, dtype="i4")
@@ -423,6 +424,7 @@ def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: Froz
         else:
             return random_params
 
+    # Copied from transformers.models.gpt_neo.modeling_flax_gpt_neo.FlaxGPTNeoPreTrainedModel.init_cache with GPTNeo->Llama
     def init_cache(self, batch_size, max_length):
         r"""
         Args:
@@ -443,6 +445,7 @@ def init_cache(self, batch_size, max_length):
         return unfreeze(init_variables["cache"])
 
     @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
+    # Copied from transformers.models.gpt_neo.modeling_flax_gpt_neo.FlaxGPTNeoPreTrainedModel.__call__ with GPTNeo->Llama
     def __call__(
         self,
         input_ids,
diff --git a/tests/models/llama/test_modeling_flax_llama.py b/tests/models/llama/test_modeling_flax_llama.py
index 78ae952f78d1c5..bc9073b1954e41 100644
--- a/tests/models/llama/test_modeling_flax_llama.py
+++ b/tests/models/llama/test_modeling_flax_llama.py
@@ -254,7 +254,6 @@ def test_generated_text(self):
         generated_ids = model.generate(**inputs, max_length=15).sequences
         generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
 
-        # TODO: add expected outputs
         # fmt: off
         EXPECTED_GENERATION = [
             "Aloha, World! 201",

From b7685594e8cbdec306605fc9e63629fe5ac7a594 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alexmc@graphcore.ai>
Date: Thu, 31 Aug 2023 15:30:50 +0100
Subject: [PATCH 40/87] swaps import guards ??? how did these get swapped
 initially?

---
 src/transformers/models/llama/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/llama/__init__.py b/src/transformers/models/llama/__init__.py
index 23e6bf8ab2d34b..969853b5cbc791 100644
--- a/src/transformers/models/llama/__init__.py
+++ b/src/transformers/models/llama/__init__.py
@@ -44,7 +44,7 @@
     _import_structure["tokenization_llama_fast"] = ["LlamaTokenizerFast"]
 
 try:
-    if not is_flax_available():
+    if not is_torch_available():
         raise OptionalDependencyNotAvailable()
 except OptionalDependencyNotAvailable:
     pass
@@ -57,7 +57,7 @@
     ]
 
 try:
-    if not is_torch_available():
+    if not is_flax_available():
         raise OptionalDependencyNotAvailable()
 except OptionalDependencyNotAvailable:
     pass

From ac3f74fb589323b084839831ef8340ad87c2c7c8 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alexmc@graphcore.ai>
Date: Thu, 31 Aug 2023 16:34:33 +0100
Subject: [PATCH 41/87] removing `inv_freq` again as pytorch version has now
 removed

---
 src/transformers/models/llama/modeling_flax_llama.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index 2e409e2b056bdc..e1f1c81c330921 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -172,11 +172,6 @@ def setup(self):
         head_dim = self.config.hidden_size // self.config.num_attention_heads
         self.sincos = create_sinusoidal_positions(self.config.max_position_embeddings, head_dim)
 
-        # inv_freq is unused but create it here to avoid mismatch when loading
-        # from pt checkpoint.
-        inv_freq = 1.0 / (10000 ** (jnp.arange(0, head_dim, 2) / head_dim))
-        self.inv_freq = self.param("inv_freq", lambda _: inv_freq)
-
     def __call__(self, key, query, position_ids):
         sincos = self.sincos[position_ids]
         sincos = jnp.split(sincos, 2, axis=-1)

From 05cade4716759927be8624fe4f5b8567f5b408e6 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alexmc@graphcore.ai>
Date: Thu, 31 Aug 2023 16:35:37 +0100
Subject: [PATCH 42/87] attempting to get CI to pass

---
 src/transformers/__init__.py                 | 4 ++--
 src/transformers/modeling_flax_utils.py      | 3 +++
 src/transformers/models/llama/__init__.py    | 4 ++--
 src/transformers/utils/dummy_flax_objects.py | 7 +++++++
 4 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 0725d6d7d66279..ae30c5c412bcb6 100644
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -4146,7 +4146,7 @@
         ["FlaxGPTNeoForCausalLM", "FlaxGPTNeoModel", "FlaxGPTNeoPreTrainedModel"]
     )
     _import_structure["models.gptj"].extend(["FlaxGPTJForCausalLM", "FlaxGPTJModel", "FlaxGPTJPreTrainedModel"])
-    _import_structure["models.llama"].extend(["FlaxLlamaForCausalLM", "FlaxLlamaModel"])
+    _import_structure["models.llama"].extend(["FlaxLlamaForCausalLM", "FlaxLlamaModel", "FlaxLlamaPreTrainedModel"])
     _import_structure["models.longt5"].extend(
         ["FlaxLongT5ForConditionalGeneration", "FlaxLongT5Model", "FlaxLongT5PreTrainedModel"]
     )
@@ -7705,7 +7705,7 @@
         from .models.gpt2 import FlaxGPT2LMHeadModel, FlaxGPT2Model, FlaxGPT2PreTrainedModel
         from .models.gpt_neo import FlaxGPTNeoForCausalLM, FlaxGPTNeoModel, FlaxGPTNeoPreTrainedModel
         from .models.gptj import FlaxGPTJForCausalLM, FlaxGPTJModel, FlaxGPTJPreTrainedModel
-        from .models.llama import FlaxLlamaForCausalLM, FlaxLlamaModel
+        from .models.llama import FlaxLlamaForCausalLM, FlaxLlamaModel, FlaxLlamaPretrainedModel
         from .models.longt5 import FlaxLongT5ForConditionalGeneration, FlaxLongT5Model, FlaxLongT5PreTrainedModel
         from .models.marian import FlaxMarianModel, FlaxMarianMTModel, FlaxMarianPreTrainedModel
         from .models.mbart import (
diff --git a/src/transformers/modeling_flax_utils.py b/src/transformers/modeling_flax_utils.py
index 9e63cb0cb961e8..3384fa1a5f0e42 100644
--- a/src/transformers/modeling_flax_utils.py
+++ b/src/transformers/modeling_flax_utils.py
@@ -295,6 +295,9 @@ def params(self, params: Union[Dict, FrozenDict]):
             params = unfreeze(params)
         param_keys = set(flatten_dict(params).keys())
         if len(self.required_params - param_keys) > 0:
+            import ipdb
+
+            ipdb.set_trace()
             raise ValueError(
                 "Some parameters are missing. Make sure that `params` include the following "
                 f"parameters {self.required_params - param_keys}"
diff --git a/src/transformers/models/llama/__init__.py b/src/transformers/models/llama/__init__.py
index 969853b5cbc791..b5e9a60cda6e3c 100644
--- a/src/transformers/models/llama/__init__.py
+++ b/src/transformers/models/llama/__init__.py
@@ -62,7 +62,7 @@
 except OptionalDependencyNotAvailable:
     pass
 else:
-    _import_structure["modeling_flax_llama"] = ["FlaxLlamaForCausalLM", "FlaxLlamaModel"]
+    _import_structure["modeling_flax_llama"] = ["FlaxLlamaForCausalLM", "FlaxLlamaModel", "FlaxLlamaPreTrainedModel"]
 
 
 if TYPE_CHECKING:
@@ -98,7 +98,7 @@
     except OptionalDependencyNotAvailable:
         pass
     else:
-        from .modeling_flax_llama import FlaxLlamaForCausalLM, FlaxLlamaModel
+        from .modeling_flax_llama import FlaxLlamaForCausalLM, FlaxLlamaModel, FlaxLlamaPreTrainedModel
 
 
 else:
diff --git a/src/transformers/utils/dummy_flax_objects.py b/src/transformers/utils/dummy_flax_objects.py
index 670333fb85004e..2ddd77b63fc57d 100644
--- a/src/transformers/utils/dummy_flax_objects.py
+++ b/src/transformers/utils/dummy_flax_objects.py
@@ -814,6 +814,13 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["flax"])
 
 
+class FlaxLlamaPretrainedModel(metaclass=DummyObject):
+    _backends = ["flax"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["flax"])
+
+
 class FlaxLongT5ForConditionalGeneration(metaclass=DummyObject):
     _backends = ["flax"]
 

From 3bf0b8b1db7b66ed5ba562037b20812757749129 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alexmc@graphcore.ai>
Date: Thu, 31 Aug 2023 16:44:04 +0100
Subject: [PATCH 43/87] adds doc entries for llama flax models

---
 docs/source/en/model_doc/llama.md | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/docs/source/en/model_doc/llama.md b/docs/source/en/model_doc/llama.md
index 9f55c425d448f7..982a5202e4a34e 100644
--- a/docs/source/en/model_doc/llama.md
+++ b/docs/source/en/model_doc/llama.md
@@ -112,3 +112,13 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
 
 [[autodoc]] LlamaForSequenceClassification
     - forward
+
+## FlaxLlamaModel
+
+[[autodoc]] FlaxLlamaModel
+    - __call__
+
+## FlaxLlamaForCausalLM
+
+[[autodoc]] FlaxLlamaForCausalLM
+    - __call__

From 211a72b927d33c39972ed47bdc8890665999c067 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alexmc@graphcore.ai>
Date: Thu, 31 Aug 2023 19:16:31 +0100
Subject: [PATCH 44/87] fixes typo in __init__.py imports

---
 src/transformers/__init__.py                 | 2 +-
 src/transformers/utils/dummy_flax_objects.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index ae30c5c412bcb6..10cb63e6708473 100644
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -7705,7 +7705,7 @@
         from .models.gpt2 import FlaxGPT2LMHeadModel, FlaxGPT2Model, FlaxGPT2PreTrainedModel
         from .models.gpt_neo import FlaxGPTNeoForCausalLM, FlaxGPTNeoModel, FlaxGPTNeoPreTrainedModel
         from .models.gptj import FlaxGPTJForCausalLM, FlaxGPTJModel, FlaxGPTJPreTrainedModel
-        from .models.llama import FlaxLlamaForCausalLM, FlaxLlamaModel, FlaxLlamaPretrainedModel
+        from .models.llama import FlaxLlamaForCausalLM, FlaxLlamaModel, FlaxLlamaPreTrainedModel
         from .models.longt5 import FlaxLongT5ForConditionalGeneration, FlaxLongT5Model, FlaxLongT5PreTrainedModel
         from .models.marian import FlaxMarianModel, FlaxMarianMTModel, FlaxMarianPreTrainedModel
         from .models.mbart import (
diff --git a/src/transformers/utils/dummy_flax_objects.py b/src/transformers/utils/dummy_flax_objects.py
index 2ddd77b63fc57d..ecf17e711556cb 100644
--- a/src/transformers/utils/dummy_flax_objects.py
+++ b/src/transformers/utils/dummy_flax_objects.py
@@ -814,7 +814,7 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["flax"])
 
 
-class FlaxLlamaPretrainedModel(metaclass=DummyObject):
+class FlaxLlamaPreTrainedModel(metaclass=DummyObject):
     _backends = ["flax"]
 
     def __init__(self, *args, **kwargs):

From 27a75226daf6c2a1633e2cf0078d26440ed87dd7 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alexmc@graphcore.ai>
Date: Thu, 31 Aug 2023 19:42:13 +0100
Subject: [PATCH 45/87] adds back special equivalence tests these come from the
 gpt neo flax tests. there is special behaviour for these models that needs to
 override the common version

---
 .../models/llama/test_modeling_flax_llama.py  | 108 +++++++++++++++++-
 1 file changed, 106 insertions(+), 2 deletions(-)

diff --git a/tests/models/llama/test_modeling_flax_llama.py b/tests/models/llama/test_modeling_flax_llama.py
index bc9073b1954e41..5d8828d75bc9d6 100644
--- a/tests/models/llama/test_modeling_flax_llama.py
+++ b/tests/models/llama/test_modeling_flax_llama.py
@@ -13,12 +13,14 @@
 # limitations under the License.
 
 
+import tempfile
 import unittest
 
 import numpy as np
 
+import transformers
 from transformers import LlamaConfig, is_flax_available, is_torch_available
-from transformers.testing_utils import require_flax, slow
+from transformers.testing_utils import is_pt_flax_cross_test, require_flax, slow
 
 from ...generation.test_flax_utils import FlaxGenerationTesterMixin
 from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor, random_attention_mask
@@ -28,10 +30,15 @@
     import jax.numpy as jnp
 
     from transformers import LlamaTokenizerFast
+    from transformers.modeling_flax_pytorch_utils import (
+        convert_pytorch_state_dict_to_flax,
+        load_flax_weights_in_pytorch_model,
+    )
     from transformers.models.llama.modeling_flax_llama import FlaxLlamaForCausalLM, FlaxLlamaModel
 
+
 if is_torch_available():
-    pass
+    import torch
 
 
 class FlaxLlamaModelTester:
@@ -195,6 +202,103 @@ def test_use_cache_forward_with_attn_mask(self):
                 model_class_name, config, input_ids, attention_mask
             )
 
+    @is_pt_flax_cross_test
+    def test_equivalence_pt_to_flax(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            with self.subTest(model_class.__name__):
+                # prepare inputs
+                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
+                pt_inputs = {k: torch.tensor(v.tolist()) for k, v in prepared_inputs_dict.items()}
+
+                # load corresponding PyTorch class
+                pt_model_class_name = model_class.__name__[4:]  # Skip the "Flax" at the beginning
+                pt_model_class = getattr(transformers, pt_model_class_name)
+
+                batch_size, seq_length = pt_inputs["input_ids"].shape
+                rnd_start_indices = np.random.randint(0, seq_length - 1, size=(batch_size,))
+                for batch_idx, start_index in enumerate(rnd_start_indices):
+                    pt_inputs["attention_mask"][batch_idx, :start_index] = 0
+                    pt_inputs["attention_mask"][batch_idx, start_index:] = 1
+                    prepared_inputs_dict["attention_mask"][batch_idx, :start_index] = 0
+                    prepared_inputs_dict["attention_mask"][batch_idx, start_index:] = 1
+                pt_model = pt_model_class(config).eval()
+                fx_model = model_class(config, dtype=jnp.float32)
+
+                fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model)
+                fx_model.params = fx_state
+
+                with torch.no_grad():
+                    pt_outputs = pt_model(**pt_inputs).to_tuple()
+
+                fx_outputs = fx_model(**prepared_inputs_dict).to_tuple()
+                self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
+                for fx_output, pt_output in zip(fx_outputs, pt_outputs):
+                    self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2)
+
+                with tempfile.TemporaryDirectory() as tmpdirname:
+                    pt_model.save_pretrained(tmpdirname)
+                    fx_model_loaded = model_class.from_pretrained(tmpdirname, from_pt=True)
+
+                fx_outputs_loaded = fx_model_loaded(**prepared_inputs_dict).to_tuple()
+                self.assertEqual(
+                    len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch"
+                )
+                for fx_output_loaded, pt_output in zip(fx_outputs_loaded, pt_outputs):
+                    self.assert_almost_equals(fx_output_loaded[:, -1], pt_output[:, -1].numpy(), 4e-2)
+
+    # overwrite from common since `attention_mask` in combination
+    # with `causal_mask` behaves slighly differently
+    @is_pt_flax_cross_test
+    def test_equivalence_flax_to_pt(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        for model_class in self.all_model_classes:
+            with self.subTest(model_class.__name__):
+                # prepare inputs
+                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
+                pt_inputs = {k: torch.tensor(v.tolist()) for k, v in prepared_inputs_dict.items()}
+
+                # load corresponding PyTorch class
+                pt_model_class_name = model_class.__name__[4:]  # Skip the "Flax" at the beginning
+                pt_model_class = getattr(transformers, pt_model_class_name)
+
+                pt_model = pt_model_class(config).eval()
+                fx_model = model_class(config, dtype=jnp.float32)
+
+                pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params)
+                batch_size, seq_length = pt_inputs["input_ids"].shape
+                rnd_start_indices = np.random.randint(0, seq_length - 1, size=(batch_size,))
+                for batch_idx, start_index in enumerate(rnd_start_indices):
+                    pt_inputs["attention_mask"][batch_idx, :start_index] = 0
+                    pt_inputs["attention_mask"][batch_idx, start_index:] = 1
+                    prepared_inputs_dict["attention_mask"][batch_idx, :start_index] = 0
+                    prepared_inputs_dict["attention_mask"][batch_idx, start_index:] = 1
+
+                # make sure weights are tied in PyTorch
+                pt_model.tie_weights()
+
+                with torch.no_grad():
+                    pt_outputs = pt_model(**pt_inputs).to_tuple()
+
+                fx_outputs = fx_model(**prepared_inputs_dict).to_tuple()
+                self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
+                for fx_output, pt_output in zip(fx_outputs, pt_outputs):
+                    self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2)
+
+                with tempfile.TemporaryDirectory() as tmpdirname:
+                    fx_model.save_pretrained(tmpdirname)
+                    pt_model_loaded = pt_model_class.from_pretrained(tmpdirname, from_flax=True)
+
+                with torch.no_grad():
+                    pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple()
+
+                self.assertEqual(
+                    len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch"
+                )
+                for fx_output, pt_output in zip(fx_outputs, pt_outputs_loaded):
+                    self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2)
+
     @slow
     def test_model_from_pretrained(self):
         for model_class_name in self.all_model_classes:

From 67f300cb7a7701b1254fe812086682292b5bb753 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alexmc@graphcore.ai>
Date: Thu, 31 Aug 2023 20:35:52 +0100
Subject: [PATCH 46/87] overrides tests with dummy to see if CI passes need to
 fill in these tests later

---
 tests/models/llama/test_modeling_llama.py | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py
index 4d6b363e4a75d4..13c61228eb3daa 100644
--- a/tests/models/llama/test_modeling_llama.py
+++ b/tests/models/llama/test_modeling_llama.py
@@ -14,9 +14,11 @@
 # limitations under the License.
 """ Testing suite for the PyTorch LLaMA model. """
 
-
+import tempfile
 import unittest
 
+import numpy as np
+
 from parameterized import parameterized
 from pytest import mark
 
@@ -35,6 +37,14 @@
 from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
 from ...test_pipeline_mixin import PipelineTesterMixin
 
+if is_flax_available():
+    import jax.numpy as jnp
+
+    from transformers.modeling_flax_pytorch_utils import (
+        convert_pytorch_state_dict_to_flax,
+        load_flax_weights_in_pytorch_model,
+    )
+    from .test_modeling_flax_llama import FlaxLlamaModelTest
 
 if is_torch_available():
     import torch
@@ -302,6 +312,16 @@ def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
 
+    # TODO: replace with better test
+    @is_pt_flax_cross_test
+    def test_equivalence_pt_to_flax(self):
+        return
+
+    # TODO: replace with better test
+    @is_pt_flax_cross_test
+    def test_equivalence_flax_to_pt(self):
+        return
+
     def test_model_various_embeddings(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         for type in ["absolute", "relative_key", "relative_key_query"]:

From 2ec5c208107eb965354a020e3b37260bd7203aef Mon Sep 17 00:00:00 2001
From: Alex McKinney <alexmc@graphcore.ai>
Date: Fri, 1 Sep 2023 09:48:23 +0100
Subject: [PATCH 47/87] adds my contribution to docs

---
 docs/source/en/model_doc/llama.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docs/source/en/model_doc/llama.md b/docs/source/en/model_doc/llama.md
index 982a5202e4a34e..1a801539896286 100644
--- a/docs/source/en/model_doc/llama.md
+++ b/docs/source/en/model_doc/llama.md
@@ -50,6 +50,9 @@ come in several checkpoints they each contain a part of each weight of the model
 
 - The LLaMA tokenizer is a BPE model based on [sentencepiece](https://github.com/google/sentencepiece). One quirk of sentencepiece is that when decoding a sequence, if the first token is the start of the word (e.g. "Banana"), the tokenizer does not prepend the prefix space to the string.
 
+This model was contributed by [zphang](https://huggingface.co/zphang) with contributions from [BlackSamorez](https://huggingface.co/BlackSamorez). The code of the implementation in Hugging Face is based on GPT-NeoX [here](https://github.com/EleutherAI/gpt-neox). The original code of the authors can be found [here](https://github.com/facebookresearch/llama). The Flax version of the implementation was contributed by [afmck](https://huggingface.co/afmck) with the code in the implementation based on Hugging Face's GPT-Neo.
+
+
 Based on the original LLaMA model, Meta AI has released some follow-up works:
 
 - **Llama2**: Llama2 is an improved version of Llama with some architectural tweaks (Grouped Query Attention), and is pre-trained on 2Trillion tokens. Refer to the documentation of Llama2 which can be found [here](llama2).

From 609a113c937bd418fa27ab1e7543f650a28cf91c Mon Sep 17 00:00:00 2001
From: Alex McKinney <alexmc@graphcore.ai>
Date: Fri, 1 Sep 2023 14:47:45 +0100
Subject: [PATCH 48/87] `make style; make quality`

---
 tests/models/llama/test_modeling_llama.py | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py
index 13c61228eb3daa..7b59380977e2c4 100644
--- a/tests/models/llama/test_modeling_llama.py
+++ b/tests/models/llama/test_modeling_llama.py
@@ -14,11 +14,8 @@
 # limitations under the License.
 """ Testing suite for the PyTorch LLaMA model. """
 
-import tempfile
 import unittest
 
-import numpy as np
-
 from parameterized import parameterized
 from pytest import mark
 
@@ -37,14 +34,10 @@
 from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
 from ...test_pipeline_mixin import PipelineTesterMixin
 
+
 if is_flax_available():
-    import jax.numpy as jnp
 
-    from transformers.modeling_flax_pytorch_utils import (
-        convert_pytorch_state_dict_to_flax,
-        load_flax_weights_in_pytorch_model,
-    )
-    from .test_modeling_flax_llama import FlaxLlamaModelTest
+    pass
 
 if is_torch_available():
     import torch

From 224f546297ba7e9e3dddb7ee56c3a987515b31c9 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alexmc@graphcore.ai>
Date: Fri, 1 Sep 2023 23:00:42 +0100
Subject: [PATCH 49/87] replaces random masking with fixed to work with flax
 version

---
 tests/models/llama/test_modeling_llama.py | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py
index 7b59380977e2c4..137699e1107c31 100644
--- a/tests/models/llama/test_modeling_llama.py
+++ b/tests/models/llama/test_modeling_llama.py
@@ -107,7 +107,8 @@ def prepare_config_and_inputs(self):
 
         input_mask = None
         if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
+            input_mask = torch.tril(torch.ones(7,7))
+            input_mask = torch.nn.functional.pad(input_mask, (0, 0, 0, self.batch_size - self.seq_length), value=1)
 
         token_type_ids = None
         if self.use_token_type_ids:
@@ -305,16 +306,6 @@ def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
 
-    # TODO: replace with better test
-    @is_pt_flax_cross_test
-    def test_equivalence_pt_to_flax(self):
-        return
-
-    # TODO: replace with better test
-    @is_pt_flax_cross_test
-    def test_equivalence_flax_to_pt(self):
-        return
-
     def test_model_various_embeddings(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         for type in ["absolute", "relative_key", "relative_key_query"]:

From 7de8b586cc3e7e7ccac0d93b1e753cde2daa8a48 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Sun, 24 Sep 2023 09:34:57 +0100
Subject: [PATCH 50/87] `make quality; make style`

---
 tests/models/llama/test_modeling_llama.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py
index 137699e1107c31..2b325ad8b7190f 100644
--- a/tests/models/llama/test_modeling_llama.py
+++ b/tests/models/llama/test_modeling_llama.py
@@ -31,12 +31,11 @@
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
+from ...test_modeling_common import ModelTesterMixin, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
 if is_flax_available():
-
     pass
 
 if is_torch_available():
@@ -107,7 +106,7 @@ def prepare_config_and_inputs(self):
 
         input_mask = None
         if self.use_input_mask:
-            input_mask = torch.tril(torch.ones(7,7))
+            input_mask = torch.tril(torch.ones(7, 7))
             input_mask = torch.nn.functional.pad(input_mask, (0, 0, 0, self.batch_size - self.seq_length), value=1)
 
         token_type_ids = None

From 20b57671d9476edb1132074194e2e22cfdd002b2 Mon Sep 17 00:00:00 2001
From: Alex McKinney <44398246+vvvm23@users.noreply.github.com>
Date: Thu, 7 Sep 2023 11:20:51 +0100
Subject: [PATCH 51/87] Update
 src/transformers/models/llama/modeling_flax_llama.py

Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>
---
 src/transformers/models/llama/modeling_flax_llama.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index e1f1c81c330921..278dec4cb56559 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -135,7 +135,9 @@ def create_sinusoidal_positions(num_pos, dim):
     return jnp.array(out[:, :, :num_pos])  # TODO: don't think slice is needed
 
 
-def rotate_half(x):
+def rotate_half(tensor):
+    """Rotates half the hidden dims of the input."""
+    rotate_half_tensor = jnp.concatenate((-tensor[..., tensor.shape[-1] // 2 :], tensor[..., : tensor.shape[-1] // 2]), axis=-1)
     """Rotates half the hidden dims of the input."""
     rotate_half_tensor = jnp.concatenate((-x[..., x.shape[-1] // 2 :], x[..., : x.shape[-1] // 2]), axis=-1)
     return rotate_half_tensor

From ac4183c33cb8e7212ad343f1cdcd95bbd9ca9165 Mon Sep 17 00:00:00 2001
From: Alex McKinney <44398246+vvvm23@users.noreply.github.com>
Date: Thu, 7 Sep 2023 11:21:34 +0100
Subject: [PATCH 52/87] Update
 src/transformers/models/llama/modeling_flax_llama.py

Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>
---
 src/transformers/models/llama/modeling_flax_llama.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index 278dec4cb56559..400aa5ff2ea125 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -377,6 +377,7 @@ def __call__(
         return (hidden_states,) + outputs[1:]
 
 
+# Copied from transformers.models.gpt_neo.modeling_flax_gpt_neo.FlaxGPTNeoPreTrainedModel with GPTNeo->Llama, GPT_NEO->LLAMA, transformer->model
 class FlaxLlamaPreTrainedModel(FlaxPreTrainedModel):
     """
     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained

From bd5451a877ca357f60418b1ca531442f4c0e371f Mon Sep 17 00:00:00 2001
From: Alex McKinney <44398246+vvvm23@users.noreply.github.com>
Date: Thu, 7 Sep 2023 11:21:52 +0100
Subject: [PATCH 53/87] Update
 src/transformers/models/llama/modeling_flax_llama.py

Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>
---
 src/transformers/models/llama/modeling_flax_llama.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index 400aa5ff2ea125..80aa41f75e70e3 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -388,7 +388,6 @@ class FlaxLlamaPreTrainedModel(FlaxPreTrainedModel):
     base_model_prefix = "model"
     module_class: nn.Module = None
 
-    # Copied from transformers.models.gpt_neo.modeling_flax_gpt_neo.FlaxGPTNeoPreTrainedModel.__init__ with GPTNeo->Llama
     def __init__(
         self,
         config: LlamaConfig,

From c997a385cd8629f67ba07335044c78db9954399d Mon Sep 17 00:00:00 2001
From: Alex McKinney <44398246+vvvm23@users.noreply.github.com>
Date: Thu, 7 Sep 2023 11:22:15 +0100
Subject: [PATCH 54/87] Update
 src/transformers/models/llama/modeling_flax_llama.py

Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>
---
 src/transformers/models/llama/modeling_flax_llama.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index 80aa41f75e70e3..1ffd5acf555f3e 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -400,7 +400,6 @@ def __init__(
         module = self.module_class(config=config, dtype=dtype, **kwargs)
         super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
 
-    # Copied from transformers.models.gpt_neo.modeling_flax_gpt_neo.FlaxGPTNeoPreTrainedModel.init_weights with GPTNeo->Llama
     def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
         # init input tensors
         input_ids = jnp.zeros(input_shape, dtype="i4")

From 5019d4ce2580b2be1357982449b8f8421936e0e4 Mon Sep 17 00:00:00 2001
From: Alex McKinney <44398246+vvvm23@users.noreply.github.com>
Date: Thu, 7 Sep 2023 11:22:33 +0100
Subject: [PATCH 55/87] Update
 src/transformers/models/llama/modeling_flax_llama.py

Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>
---
 src/transformers/models/llama/modeling_flax_llama.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index 1ffd5acf555f3e..4f365ba10caaee 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -420,7 +420,6 @@ def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: Froz
         else:
             return random_params
 
-    # Copied from transformers.models.gpt_neo.modeling_flax_gpt_neo.FlaxGPTNeoPreTrainedModel.init_cache with GPTNeo->Llama
     def init_cache(self, batch_size, max_length):
         r"""
         Args:

From 4df77308d7ae5eec74f26667cc566c4de6a27ef6 Mon Sep 17 00:00:00 2001
From: Alex McKinney <44398246+vvvm23@users.noreply.github.com>
Date: Thu, 7 Sep 2023 11:22:54 +0100
Subject: [PATCH 56/87] Update
 src/transformers/models/llama/modeling_flax_llama.py

Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>
---
 src/transformers/models/llama/modeling_flax_llama.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index 4f365ba10caaee..e443513c8a61b0 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -440,7 +440,6 @@ def init_cache(self, batch_size, max_length):
         return unfreeze(init_variables["cache"])
 
     @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
-    # Copied from transformers.models.gpt_neo.modeling_flax_gpt_neo.FlaxGPTNeoPreTrainedModel.__call__ with GPTNeo->Llama
     def __call__(
         self,
         input_ids,

From f8ccb05e7747fb1434167028e32226315befd1d8 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Sun, 24 Sep 2023 08:13:57 +0100
Subject: [PATCH 57/87] updates `x`->`tensor` in `rotate_half`

---
 src/transformers/models/llama/modeling_flax_llama.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index e443513c8a61b0..4489a3a5eec225 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -139,7 +139,7 @@ def rotate_half(tensor):
     """Rotates half the hidden dims of the input."""
     rotate_half_tensor = jnp.concatenate((-tensor[..., tensor.shape[-1] // 2 :], tensor[..., : tensor.shape[-1] // 2]), axis=-1)
     """Rotates half the hidden dims of the input."""
-    rotate_half_tensor = jnp.concatenate((-x[..., x.shape[-1] // 2 :], x[..., : x.shape[-1] // 2]), axis=-1)
+    rotate_half_tensor = jnp.concatenate((-tensor[..., tensor.shape[-1] // 2 :], tensor[..., : tensor.shape[-1] // 2]), axis=-1)
     return rotate_half_tensor
 
 

From b01cb70428731992d6781fd2548717013132006e Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Sun, 24 Sep 2023 08:33:21 +0100
Subject: [PATCH 58/87] addresses smaller review comments

---
 src/transformers/modeling_flax_utils.py       |   3 -
 .../models/llama/modeling_flax_llama.py       |  10 +-
 .../models/llama/test_modeling_flax_llama.py  | 105 +-----------------
 tests/models/llama/test_modeling_llama.py     |   7 +-
 4 files changed, 14 insertions(+), 111 deletions(-)

diff --git a/src/transformers/modeling_flax_utils.py b/src/transformers/modeling_flax_utils.py
index 3384fa1a5f0e42..9e63cb0cb961e8 100644
--- a/src/transformers/modeling_flax_utils.py
+++ b/src/transformers/modeling_flax_utils.py
@@ -295,9 +295,6 @@ def params(self, params: Union[Dict, FrozenDict]):
             params = unfreeze(params)
         param_keys = set(flatten_dict(params).keys())
         if len(self.required_params - param_keys) > 0:
-            import ipdb
-
-            ipdb.set_trace()
             raise ValueError(
                 "Some parameters are missing. Make sure that `params` include the following "
                 f"parameters {self.required_params - param_keys}"
diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index 4489a3a5eec225..1a27e4d7ca4a42 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -132,14 +132,18 @@ def create_sinusoidal_positions(num_pos, dim):
 
     emb = np.concatenate((freqs, freqs), axis=-1)
     out = np.concatenate((np.sin(emb)[:, None, :], np.cos(emb)[:, None, :]), axis=-1)
-    return jnp.array(out[:, :, :num_pos])  # TODO: don't think slice is needed
+    return jnp.array(out[:, :, :num_pos])
 
 
 def rotate_half(tensor):
     """Rotates half the hidden dims of the input."""
-    rotate_half_tensor = jnp.concatenate((-tensor[..., tensor.shape[-1] // 2 :], tensor[..., : tensor.shape[-1] // 2]), axis=-1)
+    rotate_half_tensor = jnp.concatenate(
+        (-tensor[..., tensor.shape[-1] // 2 :], tensor[..., : tensor.shape[-1] // 2]), axis=-1
+    )
     """Rotates half the hidden dims of the input."""
-    rotate_half_tensor = jnp.concatenate((-tensor[..., tensor.shape[-1] // 2 :], tensor[..., : tensor.shape[-1] // 2]), axis=-1)
+    rotate_half_tensor = jnp.concatenate(
+        (-tensor[..., tensor.shape[-1] // 2 :], tensor[..., : tensor.shape[-1] // 2]), axis=-1
+    )
     return rotate_half_tensor
 
 
diff --git a/tests/models/llama/test_modeling_flax_llama.py b/tests/models/llama/test_modeling_flax_llama.py
index 5d8828d75bc9d6..a3a719101a4b42 100644
--- a/tests/models/llama/test_modeling_flax_llama.py
+++ b/tests/models/llama/test_modeling_flax_llama.py
@@ -93,7 +93,7 @@ def prepare_config_and_inputs(self):
 
         input_mask = None
         if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
+            input_mask = np.tril(np.ones((self.batch_size, self.seq_length)))
 
         config = LlamaConfig(
             vocab_size=self.vocab_size,
@@ -202,103 +202,6 @@ def test_use_cache_forward_with_attn_mask(self):
                 model_class_name, config, input_ids, attention_mask
             )
 
-    @is_pt_flax_cross_test
-    def test_equivalence_pt_to_flax(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                # prepare inputs
-                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                pt_inputs = {k: torch.tensor(v.tolist()) for k, v in prepared_inputs_dict.items()}
-
-                # load corresponding PyTorch class
-                pt_model_class_name = model_class.__name__[4:]  # Skip the "Flax" at the beginning
-                pt_model_class = getattr(transformers, pt_model_class_name)
-
-                batch_size, seq_length = pt_inputs["input_ids"].shape
-                rnd_start_indices = np.random.randint(0, seq_length - 1, size=(batch_size,))
-                for batch_idx, start_index in enumerate(rnd_start_indices):
-                    pt_inputs["attention_mask"][batch_idx, :start_index] = 0
-                    pt_inputs["attention_mask"][batch_idx, start_index:] = 1
-                    prepared_inputs_dict["attention_mask"][batch_idx, :start_index] = 0
-                    prepared_inputs_dict["attention_mask"][batch_idx, start_index:] = 1
-                pt_model = pt_model_class(config).eval()
-                fx_model = model_class(config, dtype=jnp.float32)
-
-                fx_state = convert_pytorch_state_dict_to_flax(pt_model.state_dict(), fx_model)
-                fx_model.params = fx_state
-
-                with torch.no_grad():
-                    pt_outputs = pt_model(**pt_inputs).to_tuple()
-
-                fx_outputs = fx_model(**prepared_inputs_dict).to_tuple()
-                self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-                for fx_output, pt_output in zip(fx_outputs, pt_outputs):
-                    self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2)
-
-                with tempfile.TemporaryDirectory() as tmpdirname:
-                    pt_model.save_pretrained(tmpdirname)
-                    fx_model_loaded = model_class.from_pretrained(tmpdirname, from_pt=True)
-
-                fx_outputs_loaded = fx_model_loaded(**prepared_inputs_dict).to_tuple()
-                self.assertEqual(
-                    len(fx_outputs_loaded), len(pt_outputs), "Output lengths differ between Flax and PyTorch"
-                )
-                for fx_output_loaded, pt_output in zip(fx_outputs_loaded, pt_outputs):
-                    self.assert_almost_equals(fx_output_loaded[:, -1], pt_output[:, -1].numpy(), 4e-2)
-
-    # overwrite from common since `attention_mask` in combination
-    # with `causal_mask` behaves slighly differently
-    @is_pt_flax_cross_test
-    def test_equivalence_flax_to_pt(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        for model_class in self.all_model_classes:
-            with self.subTest(model_class.__name__):
-                # prepare inputs
-                prepared_inputs_dict = self._prepare_for_class(inputs_dict, model_class)
-                pt_inputs = {k: torch.tensor(v.tolist()) for k, v in prepared_inputs_dict.items()}
-
-                # load corresponding PyTorch class
-                pt_model_class_name = model_class.__name__[4:]  # Skip the "Flax" at the beginning
-                pt_model_class = getattr(transformers, pt_model_class_name)
-
-                pt_model = pt_model_class(config).eval()
-                fx_model = model_class(config, dtype=jnp.float32)
-
-                pt_model = load_flax_weights_in_pytorch_model(pt_model, fx_model.params)
-                batch_size, seq_length = pt_inputs["input_ids"].shape
-                rnd_start_indices = np.random.randint(0, seq_length - 1, size=(batch_size,))
-                for batch_idx, start_index in enumerate(rnd_start_indices):
-                    pt_inputs["attention_mask"][batch_idx, :start_index] = 0
-                    pt_inputs["attention_mask"][batch_idx, start_index:] = 1
-                    prepared_inputs_dict["attention_mask"][batch_idx, :start_index] = 0
-                    prepared_inputs_dict["attention_mask"][batch_idx, start_index:] = 1
-
-                # make sure weights are tied in PyTorch
-                pt_model.tie_weights()
-
-                with torch.no_grad():
-                    pt_outputs = pt_model(**pt_inputs).to_tuple()
-
-                fx_outputs = fx_model(**prepared_inputs_dict).to_tuple()
-                self.assertEqual(len(fx_outputs), len(pt_outputs), "Output lengths differ between Flax and PyTorch")
-                for fx_output, pt_output in zip(fx_outputs, pt_outputs):
-                    self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2)
-
-                with tempfile.TemporaryDirectory() as tmpdirname:
-                    fx_model.save_pretrained(tmpdirname)
-                    pt_model_loaded = pt_model_class.from_pretrained(tmpdirname, from_flax=True)
-
-                with torch.no_grad():
-                    pt_outputs_loaded = pt_model_loaded(**pt_inputs).to_tuple()
-
-                self.assertEqual(
-                    len(fx_outputs), len(pt_outputs_loaded), "Output lengths differ between Flax and PyTorch"
-                )
-                for fx_output, pt_output in zip(fx_outputs, pt_outputs_loaded):
-                    self.assert_almost_equals(fx_output[:, -1], pt_output[:, -1].numpy(), 4e-2)
-
     @slow
     def test_model_from_pretrained(self):
         for model_class_name in self.all_model_classes:
@@ -310,13 +213,15 @@ def test_model_from_pretrained(self):
     def test_model_logits(self):
         model_id = "openlm-research/open_llama_3b_v2"
         model = FlaxLlamaForCausalLM.from_pretrained(model_id, from_pt=True)
-        test_batch = jnp.arange(32).reshape(4, 8) + 0x777
+        test_batch = jnp.arange(32).reshape(4, 8) + 1911
 
         flax_logits = model(test_batch).logits
 
         # fmt: off
         EXPECTED_LOGITS = [-74.4243, -74.0680, -65.2507, -79.1658, -77.7460, -69.2379, -86.4588, -84.8933, -77.8456]
-        EXPECTED_MIN, EXPECTED_MAX, EXPECTED_MEAN = -96.9952, -18.4571, -65.0608
+        EXPECTED_MIN, EXPECTED_MAX, EXPECTED_MEAN = -96.9952
+        EXPECTED_MAX = -18.4571
+        EXPECTED_MEAN = -65.0608
         # fmt: on
 
         self.assertTrue(np.allclose(flax_logits[0, :3, :3].flatten(), EXPECTED_LOGITS, atol=1e-4))
diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py
index 2b325ad8b7190f..9b0b4fc4e8c9fc 100644
--- a/tests/models/llama/test_modeling_llama.py
+++ b/tests/models/llama/test_modeling_llama.py
@@ -35,9 +35,6 @@
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
-if is_flax_available():
-    pass
-
 if is_torch_available():
     import torch
 
@@ -106,8 +103,8 @@ def prepare_config_and_inputs(self):
 
         input_mask = None
         if self.use_input_mask:
-            input_mask = torch.tril(torch.ones(7, 7))
-            input_mask = torch.nn.functional.pad(input_mask, (0, 0, 0, self.batch_size - self.seq_length), value=1)
+            input_mask = torch.tril(torch.ones(self.batch_size, self.seq_length))
+            # input_mask = torch.nn.functional.pad(input_mask, (0, 0, 0, self.batch_size - self.seq_length), value=1)
 
         token_type_ids = None
         if self.use_token_type_ids:

From 6848c63bff6793986c9cbb5a8fde8499d50c7504 Mon Sep 17 00:00:00 2001
From: Alex McKinney <44398246+vvvm23@users.noreply.github.com>
Date: Sun, 24 Sep 2023 08:34:53 +0100
Subject: [PATCH 59/87] Update docs/source/en/model_doc/llama.md

Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>
---
 docs/source/en/model_doc/llama.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/en/model_doc/llama.md b/docs/source/en/model_doc/llama.md
index 1a801539896286..96f2a2e7eb7cdd 100644
--- a/docs/source/en/model_doc/llama.md
+++ b/docs/source/en/model_doc/llama.md
@@ -50,7 +50,7 @@ come in several checkpoints they each contain a part of each weight of the model
 
 - The LLaMA tokenizer is a BPE model based on [sentencepiece](https://github.com/google/sentencepiece). One quirk of sentencepiece is that when decoding a sequence, if the first token is the start of the word (e.g. "Banana"), the tokenizer does not prepend the prefix space to the string.
 
-This model was contributed by [zphang](https://huggingface.co/zphang) with contributions from [BlackSamorez](https://huggingface.co/BlackSamorez). The code of the implementation in Hugging Face is based on GPT-NeoX [here](https://github.com/EleutherAI/gpt-neox). The original code of the authors can be found [here](https://github.com/facebookresearch/llama). The Flax version of the implementation was contributed by [afmck](https://huggingface.co/afmck) with the code in the implementation based on Hugging Face's GPT-Neo.
+This model was contributed by [zphang](https://huggingface.co/zphang) with contributions from [BlackSamorez](https://huggingface.co/BlackSamorez). The code of the implementation in Hugging Face is based on GPT-NeoX [here](https://github.com/EleutherAI/gpt-neox). The original code of the authors can be found [here](https://github.com/facebookresearch/llama). The Flax version of the implementation was contributed by [afmck](https://huggingface.co/afmck) with the code in the implementation based on Hugging Face's Flax GPT-Neo.
 
 
 Based on the original LLaMA model, Meta AI has released some follow-up works:

From 9994b91cef5786681673b1b76e96f76b573a1ef2 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Sun, 24 Sep 2023 08:43:31 +0100
Subject: [PATCH 60/87] adds integration test class

---
 .../models/llama/test_modeling_flax_llama.py  | 30 ++++++++-----------
 1 file changed, 12 insertions(+), 18 deletions(-)

diff --git a/tests/models/llama/test_modeling_flax_llama.py b/tests/models/llama/test_modeling_flax_llama.py
index a3a719101a4b42..e2b2f9e5b9c53b 100644
--- a/tests/models/llama/test_modeling_flax_llama.py
+++ b/tests/models/llama/test_modeling_flax_llama.py
@@ -209,13 +209,16 @@ def test_model_from_pretrained(self):
             outputs = model(np.ones((1, 1)))
             self.assertIsNotNone(outputs)
 
-    @slow
-    def test_model_logits(self):
-        model_id = "openlm-research/open_llama_3b_v2"
-        model = FlaxLlamaForCausalLM.from_pretrained(model_id, from_pt=True)
-        test_batch = jnp.arange(32).reshape(4, 8) + 1911
+@slow
+@require_flax
+class FlaxLlamaIntegrationTest(unittest.TestCase):
+    def setUp(self):
+        self.model_id = "openlm-research/open_llama_3b_v2"
+        self.model = FlaxLlamaForCausalLM.from_pretrained(self.model_id, from_pt=True)
+        self.test_batch = jnp.arange(32).reshape(4, 8) + 1911
 
-        flax_logits = model(test_batch).logits
+    def test_model_logits(self):
+        flax_logits = self.model(self.test_batch).logits
 
         # fmt: off
         EXPECTED_LOGITS = [-74.4243, -74.0680, -65.2507, -79.1658, -77.7460, -69.2379, -86.4588, -84.8933, -77.8456]
@@ -229,13 +232,8 @@ def test_model_logits(self):
         self.assertAlmostEqual(flax_logits.max(), EXPECTED_MAX, places=3)
         self.assertAlmostEqual(flax_logits.mean(), EXPECTED_MEAN, places=3)
 
-    @slow
     def test_model_hidden_states(self):
-        model_id = "openlm-research/open_llama_3b_v2"
-        model = FlaxLlamaForCausalLM.from_pretrained(model_id, from_pt=True)
-        test_batch = jnp.arange(32).reshape(4, 8) + 0x777
-
-        flax_hidden_states = model(test_batch, output_hidden_states=True).hidden_states
+        flax_hidden_states = self.model(self.test_batch, output_hidden_states=True).hidden_states
         flax_hidden_means = [h.mean() for h in flax_hidden_states]
 
         # fmt: off
@@ -250,17 +248,13 @@ def test_model_hidden_states(self):
 
         self.assertTrue(np.allclose(flax_hidden_means, EXPECTED_HIDDEN_MEANS, atol=1e-4))
 
-    @slow
     def test_generated_text(self):
-        model_id = "openlm-research/open_llama_3b_v2"
-        model = FlaxLlamaForCausalLM.from_pretrained(model_id, from_pt=True)
-
-        tokenizer = LlamaTokenizerFast.from_pretrained(model_id)
+        tokenizer = LlamaTokenizerFast.from_pretrained(self.model_id)
         tokenizer.pad_token_id = 2
         test_batch = ["Aloha, World! ", "2 + 2 = ", "Paris is the capital of ", "我很高興認識"]
 
         inputs = tokenizer(test_batch, return_tensors="np", truncation=True, padding=True)
-        generated_ids = model.generate(**inputs, max_length=15).sequences
+        generated_ids = self.model.generate(**inputs, max_length=15).sequences
         generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
 
         # fmt: off

From d248925aaeeae431c8c8217601a1b3fa7f626107 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Sun, 24 Sep 2023 09:13:26 +0100
Subject: [PATCH 61/87] adds `dtype` to rotary embedding to cast outputs

---
 src/transformers/models/llama/modeling_flax_llama.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index 1a27e4d7ca4a42..80e11d91cde5cc 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -173,6 +173,7 @@ def __call__(self, hidden_states):
 
 class FlaxLlamaRotaryEmbedding(nn.Module):
     config: LlamaConfig
+    dtype: jnp.dtype = jnp.float32
 
     def setup(self):
         head_dim = self.config.hidden_size // self.config.num_attention_heads
@@ -185,6 +186,9 @@ def __call__(self, key, query, position_ids):
         key = apply_rotary_pos_emb(key, sincos)
         query = apply_rotary_pos_emb(query, sincos)
 
+        key = jnp.asarray(key, dtype=self.dtype)
+        query = jnp.asarray(query, dtype=self.dtype)
+
         return key, query
 
 

From f1fc40a3c0db08ace29d45dc1425ee23cfd19f3c Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Sun, 24 Sep 2023 09:20:53 +0100
Subject: [PATCH 62/87] adds type to flax llama rotary layer

---
 src/transformers/models/llama/modeling_flax_llama.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index 80e11d91cde5cc..6fbf49d77c3d02 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -154,13 +154,13 @@ def apply_rotary_pos_emb(tensor, sincos):
 
 class FlaxLlamaRMSNorm(nn.Module):
     config: LlamaConfig
+    dtype: jnp.dtype = jnp.float32
 
     def setup(self):
         self.epsilon = self.config.rms_norm_eps
         self.weight = self.param("weight", lambda _, shape: jnp.ones(shape), self.config.hidden_size)
 
     def __call__(self, hidden_states):
-        input_dtype = hidden_states.dtype
         variance = jnp.asarray(hidden_states, dtype=jnp.float32)
         variance = jnp.power(variance, 2)
         variance = variance.mean(-1, keepdims=True)
@@ -168,7 +168,7 @@ def __call__(self, hidden_states):
         # hidden_states = hidden_states * jax.lax.rsqrt(variance + self.epsilon)
         hidden_states = hidden_states / jnp.sqrt(variance + self.epsilon)
 
-        return self.weight * jnp.asarray(hidden_states, dtype=input_dtype)
+        return self.weight * jnp.asarray(hidden_states, dtype=self.dtype)
 
 
 class FlaxLlamaRotaryEmbedding(nn.Module):
@@ -216,7 +216,7 @@ def setup(self):
         self.o_proj = dense()
 
         self.causal_mask = make_causal_mask(jnp.ones((1, config.max_position_embeddings), dtype="bool"), dtype="bool")
-        self.rotary_emb = FlaxLlamaRotaryEmbedding(config)
+        self.rotary_emb = FlaxLlamaRotaryEmbedding(config, dtype=self.dtype)
 
     def _split_heads(self, hidden_states):
         return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim))
@@ -348,9 +348,9 @@ class FlaxLlamaDecoderLayer(nn.Module):
     dtype: jnp.dtype = jnp.float32
 
     def setup(self):
-        self.input_layernorm = FlaxLlamaRMSNorm(self.config)
+        self.input_layernorm = FlaxLlamaRMSNorm(self.config, dtype=self.dtype)
         self.self_attn = FlaxLlamaAttention(self.config, dtype=self.dtype)
-        self.post_attention_layernorm = FlaxLlamaRMSNorm(self.config)
+        self.post_attention_layernorm = FlaxLlamaRMSNorm(self.config, dtype=self.dtype)
         self.mlp = FlaxLlamaMLP(self.config, dtype=self.dtype)
 
     def __call__(

From 1f7cb9b8d845c956e8693ec66c5654546ecc57c5 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Sun, 24 Sep 2023 09:22:44 +0100
Subject: [PATCH 63/87] `make style`

---
 tests/models/llama/test_modeling_flax_llama.py | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/tests/models/llama/test_modeling_flax_llama.py b/tests/models/llama/test_modeling_flax_llama.py
index e2b2f9e5b9c53b..0341358085bca0 100644
--- a/tests/models/llama/test_modeling_flax_llama.py
+++ b/tests/models/llama/test_modeling_flax_llama.py
@@ -13,32 +13,26 @@
 # limitations under the License.
 
 
-import tempfile
 import unittest
 
 import numpy as np
 
-import transformers
 from transformers import LlamaConfig, is_flax_available, is_torch_available
-from transformers.testing_utils import is_pt_flax_cross_test, require_flax, slow
+from transformers.testing_utils import require_flax, slow
 
 from ...generation.test_flax_utils import FlaxGenerationTesterMixin
-from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor, random_attention_mask
+from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor
 
 
 if is_flax_available():
     import jax.numpy as jnp
 
     from transformers import LlamaTokenizerFast
-    from transformers.modeling_flax_pytorch_utils import (
-        convert_pytorch_state_dict_to_flax,
-        load_flax_weights_in_pytorch_model,
-    )
     from transformers.models.llama.modeling_flax_llama import FlaxLlamaForCausalLM, FlaxLlamaModel
 
 
 if is_torch_available():
-    import torch
+    pass
 
 
 class FlaxLlamaModelTester:
@@ -209,6 +203,7 @@ def test_model_from_pretrained(self):
             outputs = model(np.ones((1, 1)))
             self.assertIsNotNone(outputs)
 
+
 @slow
 @require_flax
 class FlaxLlamaIntegrationTest(unittest.TestCase):

From be7be9153dac972db5520257b055a9c7a460278f Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Sun, 24 Sep 2023 09:50:58 +0100
Subject: [PATCH 64/87] `make fix-copies`

---
 docs/source/en/index.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/en/index.md b/docs/source/en/index.md
index 8aa372391d19bd..036c0bb4db253b 100644
--- a/docs/source/en/index.md
+++ b/docs/source/en/index.md
@@ -1,4 +1,4 @@
-<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+        <!--Copyright 2020 The HuggingFace Team. All rights reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at

From 3a7a3ae50cb922f7b1170318c2d5b27da74e8ddb Mon Sep 17 00:00:00 2001
From: Alex McKinney <44398246+vvvm23@users.noreply.github.com>
Date: Sat, 30 Sep 2023 13:34:41 +0100
Subject: [PATCH 65/87] Apply suggestions from code review

Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>
---
 src/transformers/models/llama/modeling_flax_llama.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index 6fbf49d77c3d02..4fd279c6498507 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+# Copyright 2023 Meta AI, EleutherAI and the HuggingFace Inc. team. All rights reserved.
 #
 # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
 # and OPT implementations in this library. It has been modified from its
@@ -17,7 +17,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+"""Flax LLaMA model."""
 from functools import partial
 from typing import Optional, Tuple
 
@@ -140,7 +140,6 @@ def rotate_half(tensor):
     rotate_half_tensor = jnp.concatenate(
         (-tensor[..., tensor.shape[-1] // 2 :], tensor[..., : tensor.shape[-1] // 2]), axis=-1
     )
-    """Rotates half the hidden dims of the input."""
     rotate_half_tensor = jnp.concatenate(
         (-tensor[..., tensor.shape[-1] // 2 :], tensor[..., : tensor.shape[-1] // 2]), axis=-1
     )
@@ -165,7 +164,6 @@ def __call__(self, hidden_states):
         variance = jnp.power(variance, 2)
         variance = variance.mean(-1, keepdims=True)
         # use `jax.numpy.sqrt` as `jax.lax.rsqrt` does not match `torch.rsqrt`
-        # hidden_states = hidden_states * jax.lax.rsqrt(variance + self.epsilon)
         hidden_states = hidden_states / jnp.sqrt(variance + self.epsilon)
 
         return self.weight * jnp.asarray(hidden_states, dtype=self.dtype)
@@ -591,7 +589,7 @@ def __call__(
         output_hidden_states: bool = False,
         return_dict: bool = True,
     ):
-        input_embeds = self.embed_tokens(input_ids.astype("i4"))
+        input_embeds = self.embed_tokens(input_ids)
 
         outputs = self.layers(
             input_embeds,
@@ -684,6 +682,7 @@ def __call__(
     """,
     LLAMA_START_DOCSTRING,
 )
+# Copied from transformers.models.gptj.modeling_flax_gptj.FlaxLlamaForCausalLM with GPTJ->Llama
 class FlaxLlamaForCausalLM(FlaxLlamaPreTrainedModel):
     module_class = FlaxLlamaForCausalLMModule
 

From 20e6b35abd4c22fe69829da7a6b068a72239497f Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Sat, 30 Sep 2023 13:45:53 +0100
Subject: [PATCH 66/87] applies suggestions from review

---
 .../models/llama/modeling_flax_llama.py            | 14 ++++++++------
 tests/models/llama/test_modeling_flax_llama.py     |  8 ++------
 2 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index 4fd279c6498507..3a9bb36a13ab82 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -328,16 +328,18 @@ def setup(self):
         embed_dim = self.config.hidden_size
         inner_dim = self.config.intermediate_size if self.config.intermediate_size is not None else 4 * embed_dim
 
-        jax.nn.initializers.normal(self.config.initializer_range)
+        kernel_init = jax.nn.initializers.normal(self.config.initializer_range)
         self.act = ACT2FN[self.config.hidden_act]
 
-        self.gate_proj = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype)
-        self.down_proj = nn.Dense(embed_dim, use_bias=False, dtype=self.dtype)
-        self.up_proj = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype)
+        self.gate_proj = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, kernel_init=kernel_init)
+        self.down_proj = nn.Dense(embed_dim, use_bias=False, dtype=self.dtype, kernel_init=kernel_init)
+        self.up_proj = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, kernel_init=kernel_init)
 
     def __call__(self, hidden_states):
-        hidden_states = self.up_proj(hidden_states) * self.act(self.gate_proj(hidden_states))
-        hidden_states = self.down_proj(hidden_states)
+        hidden_states = self.up_proj(hidden_states)
+        gate_states = self.act(self.gate_proj(hidden_states))
+
+        hidden_states = self.down_proj(hidden_states * gate_states)
         return hidden_states
 
 
diff --git a/tests/models/llama/test_modeling_flax_llama.py b/tests/models/llama/test_modeling_flax_llama.py
index 0341358085bca0..b67b1fb3a72697 100644
--- a/tests/models/llama/test_modeling_flax_llama.py
+++ b/tests/models/llama/test_modeling_flax_llama.py
@@ -17,7 +17,7 @@
 
 import numpy as np
 
-from transformers import LlamaConfig, is_flax_available, is_torch_available
+from transformers import LlamaConfig, is_flax_available
 from transformers.testing_utils import require_flax, slow
 
 from ...generation.test_flax_utils import FlaxGenerationTesterMixin
@@ -31,10 +31,6 @@
     from transformers.models.llama.modeling_flax_llama import FlaxLlamaForCausalLM, FlaxLlamaModel
 
 
-if is_torch_available():
-    pass
-
-
 class FlaxLlamaModelTester:
     def __init__(
         self,
@@ -99,7 +95,7 @@ def prepare_config_and_inputs(self):
             hidden_dropout_prob=self.hidden_dropout_prob,
             attention_probs_dropout_prob=self.attention_probs_dropout_prob,
             max_position_embeddings=self.max_position_embeddings,
-            use_cache=False,
+            use_cache=True,
             is_decoder=False,
             initializer_range=self.initializer_range,
         )

From 3da6a6ad2279ca0450574abf36f62c8427444afb Mon Sep 17 00:00:00 2001
From: Alex McKinney <44398246+vvvm23@users.noreply.github.com>
Date: Mon, 2 Oct 2023 11:45:33 +0100
Subject: [PATCH 67/87] Update modeling_flax_llama.py

---
 src/transformers/models/llama/modeling_flax_llama.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index 3a9bb36a13ab82..df45f25e73ce35 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -684,7 +684,7 @@ def __call__(
     """,
     LLAMA_START_DOCSTRING,
 )
-# Copied from transformers.models.gptj.modeling_flax_gptj.FlaxLlamaForCausalLM with GPTJ->Llama
+# Copied from transformers.models.gptj.modeling_flax_gptj.FlaxGPTJForCausalLM with GPTJ->Llama
 class FlaxLlamaForCausalLM(FlaxLlamaPreTrainedModel):
     module_class = FlaxLlamaForCausalLMModule
 

From 5f5ca1d77206c3f4d3d35be2e4c4db9930456440 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Mon, 2 Oct 2023 13:35:31 +0100
Subject: [PATCH 68/87] `make fix-copies`

---
 src/transformers/models/llama/modeling_flax_llama.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index df45f25e73ce35..8995213e750c9e 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -688,7 +688,7 @@ def __call__(
 class FlaxLlamaForCausalLM(FlaxLlamaPreTrainedModel):
     module_class = FlaxLlamaForCausalLMModule
 
-    def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jnp.DeviceArray] = None):
+    def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None):
         # initializing the cache
         batch_size, seq_length = input_ids.shape
 

From 9166130c28a2c3ec719e38e4c97661cf17faa8ce Mon Sep 17 00:00:00 2001
From: Alex McKinney <44398246+vvvm23@users.noreply.github.com>
Date: Mon, 2 Oct 2023 19:32:36 +0100
Subject: [PATCH 69/87] Update tests/models/llama/test_modeling_llama.py

Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>
---
 tests/models/llama/test_modeling_llama.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py
index 9b0b4fc4e8c9fc..8490784db15dbb 100644
--- a/tests/models/llama/test_modeling_llama.py
+++ b/tests/models/llama/test_modeling_llama.py
@@ -104,7 +104,6 @@ def prepare_config_and_inputs(self):
         input_mask = None
         if self.use_input_mask:
             input_mask = torch.tril(torch.ones(self.batch_size, self.seq_length))
-            # input_mask = torch.nn.functional.pad(input_mask, (0, 0, 0, self.batch_size - self.seq_length), value=1)
 
         token_type_ids = None
         if self.use_token_type_ids:

From d8570b48ba5e0312fa73bea28d5b68e50f460848 Mon Sep 17 00:00:00 2001
From: Alex McKinney <44398246+vvvm23@users.noreply.github.com>
Date: Mon, 2 Oct 2023 19:32:52 +0100
Subject: [PATCH 70/87] Update
 src/transformers/models/llama/modeling_flax_llama.py

Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>
---
 src/transformers/models/llama/modeling_flax_llama.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index 8995213e750c9e..fcf4f5fadc76c0 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -591,7 +591,7 @@ def __call__(
         output_hidden_states: bool = False,
         return_dict: bool = True,
     ):
-        input_embeds = self.embed_tokens(input_ids)
+        input_embeds = self.embed_tokens(input_ids.astype("i4"))
 
         outputs = self.layers(
             input_embeds,

From 6d7a930943f77bcaba957fd732d9feadf5928cb9 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Tue, 3 Oct 2023 09:28:15 +0100
Subject: [PATCH 71/87] fixes shape mismatch in FlaxLlamaMLP

---
 src/transformers/models/llama/modeling_flax_llama.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index fcf4f5fadc76c0..c5a09a8907a8a7 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -336,10 +336,10 @@ def setup(self):
         self.up_proj = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, kernel_init=kernel_init)
 
     def __call__(self, hidden_states):
-        hidden_states = self.up_proj(hidden_states)
+        up_proj_states = self.up_proj(hidden_states)
         gate_states = self.act(self.gate_proj(hidden_states))
 
-        hidden_states = self.down_proj(hidden_states * gate_states)
+        hidden_states = self.down_proj(up_proj_states * gate_states)
         return hidden_states
 
 

From 39b55f8dd9f3e805ac5c2476b827a3ac01a9bce0 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Sat, 7 Oct 2023 12:50:50 +0100
Subject: [PATCH 72/87] applies some suggestions from reviews

---
 .../models/llama/modeling_flax_llama.py            | 14 +++++---------
 tests/models/llama/test_modeling_flax_llama.py     |  9 +++++----
 2 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index c5a09a8907a8a7..a37f0840862be7 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -140,14 +140,10 @@ def rotate_half(tensor):
     rotate_half_tensor = jnp.concatenate(
         (-tensor[..., tensor.shape[-1] // 2 :], tensor[..., : tensor.shape[-1] // 2]), axis=-1
     )
-    rotate_half_tensor = jnp.concatenate(
-        (-tensor[..., tensor.shape[-1] // 2 :], tensor[..., : tensor.shape[-1] // 2]), axis=-1
-    )
     return rotate_half_tensor
 
 
-def apply_rotary_pos_emb(tensor, sincos):
-    sin_pos, cos_pos = sincos
+def apply_rotary_pos_emb(tensor, sin_pos, cos_pos):
     return (tensor * cos_pos) + (rotate_half(tensor) * sin_pos)
 
 
@@ -179,10 +175,10 @@ def setup(self):
 
     def __call__(self, key, query, position_ids):
         sincos = self.sincos[position_ids]
-        sincos = jnp.split(sincos, 2, axis=-1)
+        sin_pos, cos_pos = jnp.split(sincos, 2, axis=-1)
 
-        key = apply_rotary_pos_emb(key, sincos)
-        query = apply_rotary_pos_emb(query, sincos)
+        key = apply_rotary_pos_emb(key, sin_pos, cos_pos)
+        query = apply_rotary_pos_emb(query, sin_pos, cos_pos)
 
         key = jnp.asarray(key, dtype=self.dtype)
         query = jnp.asarray(query, dtype=self.dtype)
@@ -578,7 +574,7 @@ def setup(self):
             dtype=self.dtype,
         )
         self.layers = FlaxLlamaLayerCollection(self.config, dtype=self.dtype)
-        self.norm = FlaxLlamaRMSNorm(self.config)
+        self.norm = FlaxLlamaRMSNorm(self.config, dtype=self.dtype)
 
     def __call__(
         self,
diff --git a/tests/models/llama/test_modeling_flax_llama.py b/tests/models/llama/test_modeling_flax_llama.py
index b67b1fb3a72697..a81398786b43b6 100644
--- a/tests/models/llama/test_modeling_flax_llama.py
+++ b/tests/models/llama/test_modeling_flax_llama.py
@@ -17,7 +17,7 @@
 
 import numpy as np
 
-from transformers import LlamaConfig, is_flax_available
+from transformers import LlamaConfig, is_flax_available, is_tokenizers_available
 from transformers.testing_utils import require_flax, slow
 
 from ...generation.test_flax_utils import FlaxGenerationTesterMixin
@@ -27,10 +27,13 @@
 if is_flax_available():
     import jax.numpy as jnp
 
-    from transformers import LlamaTokenizerFast
     from transformers.models.llama.modeling_flax_llama import FlaxLlamaForCausalLM, FlaxLlamaModel
 
 
+if is_tokenizers_available():
+    from transformers import LlamaTokenizerFast
+
+
 class FlaxLlamaModelTester:
     def __init__(
         self,
@@ -45,7 +48,6 @@ def __init__(
         hidden_size=16,
         num_hidden_layers=2,
         num_attention_heads=2,
-        attention_types=[[["global", "local"], 2]],
         intermediate_size=64,
         hidden_act="gelu",
         hidden_dropout_prob=0.1,
@@ -65,7 +67,6 @@ def __init__(
         self.hidden_size = hidden_size
         self.num_hidden_layers = num_hidden_layers
         self.num_attention_heads = num_attention_heads
-        self.attention_types = attention_types
         self.intermediate_size = intermediate_size
         self.hidden_act = hidden_act
         self.hidden_dropout_prob = hidden_dropout_prob

From a6d8c0675ac47b9518c94bc5d3b87a8cfbfbac84 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Mon, 16 Oct 2023 22:00:52 +0100
Subject: [PATCH 73/87] casts attn output logits to f32 regardless of dtype

---
 src/transformers/models/llama/modeling_flax_llama.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index a37f0840862be7..8f90c48c29069b 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -197,6 +197,7 @@ def setup(self):
         self.embed_dim = config.hidden_size
         self.num_heads = config.num_attention_heads
         self.head_dim = self.embed_dim // self.num_heads
+        self.attention_softmax_in_fp32 = self.dtype is not jnp.float32
 
         dense = partial(
             nn.Dense,
@@ -300,14 +301,18 @@ def __call__(
         )
 
         # usual dot product attention
+        attention_dtype = jnp.float32 if self.attention_softmax_in_fp32 else self.dtype
         attn_weights = dot_product_attention_weights(
             query,
             key,
             bias=attention_bias,
             deterministic=deterministic,
-            dtype=self.dtype,
+            dtype=attention_dtype,
         )
 
+        if self.attention_softmax_in_fp32:
+            attn_weights = attn_weights.astype(self.dtype)
+
         attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value)
         attn_output = self._merge_heads(attn_output)
         attn_output = self.o_proj(attn_output)

From 9718fed851e9368d4fdef5a5e13b535a440f6eed Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Mon, 16 Oct 2023 22:13:05 +0100
Subject: [PATCH 74/87] adds attn bias using `LlamaConfig.attention_bias`

---
 src/transformers/models/llama/modeling_flax_llama.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index 8f90c48c29069b..c7d37ea06aad21 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -202,7 +202,7 @@ def setup(self):
         dense = partial(
             nn.Dense,
             self.embed_dim,
-            use_bias=False,
+            use_bias=config.attention_bias,
             dtype=self.dtype,
             kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
         )

From fc6554b752a4d54f2ca438c4e05b30e3a3a71ff7 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Tue, 17 Oct 2023 08:21:32 +0100
Subject: [PATCH 75/87] adds Copied From comments to Flax Llama test

---
 tests/models/llama/test_modeling_flax_llama.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/models/llama/test_modeling_flax_llama.py b/tests/models/llama/test_modeling_flax_llama.py
index a81398786b43b6..0f429a623e801d 100644
--- a/tests/models/llama/test_modeling_flax_llama.py
+++ b/tests/models/llama/test_modeling_flax_llama.py
@@ -103,12 +103,14 @@ def prepare_config_and_inputs(self):
 
         return (config, input_ids, input_mask)
 
+    # Copied from tests.models.gpt_neo.FlaxGPTNeoModelTester.prepare_config_and_inputs_for_common
     def prepare_config_and_inputs_for_common(self):
         config_and_inputs = self.prepare_config_and_inputs()
         config, input_ids, attention_mask = config_and_inputs
         inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask}
         return config, inputs_dict
 
+    # Copied from tests.models.gpt_neo.FlaxGPTNeoModelTester.check_use_cache_forward
     def check_use_cache_forward(self, model_class_name, config, input_ids, attention_mask):
         max_decoder_length = 20
         model = model_class_name(config)
@@ -139,6 +141,7 @@ def check_use_cache_forward(self, model_class_name, config, input_ids, attention
         diff = np.max(np.abs((outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5])))
         self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
 
+    # Copied from tests.models.gpt_neo.FlaxGPTNeoModelTester.check_use_cache_forward_with_attn_mask
     def check_use_cache_forward_with_attn_mask(self, model_class_name, config, input_ids, attention_mask):
         max_decoder_length = 20
         model = model_class_name(config)

From f9fd7b645ac3ca021f60252adf385ea6a712c04c Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Tue, 17 Oct 2023 08:49:42 +0100
Subject: [PATCH 76/87] mistral and persimmon test change -copy from llama

---
 tests/models/mistral/test_modeling_mistral.py     | 2 +-
 tests/models/persimmon/test_modeling_persimmon.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/models/mistral/test_modeling_mistral.py b/tests/models/mistral/test_modeling_mistral.py
index 4bcb722c144e09..39f4b7b371ca9d 100644
--- a/tests/models/mistral/test_modeling_mistral.py
+++ b/tests/models/mistral/test_modeling_mistral.py
@@ -106,7 +106,7 @@ def prepare_config_and_inputs(self):
 
         input_mask = None
         if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
+            input_mask = torch.tril(torch.ones(self.batch_size, self.seq_length))
 
         token_type_ids = None
         if self.use_token_type_ids:
diff --git a/tests/models/persimmon/test_modeling_persimmon.py b/tests/models/persimmon/test_modeling_persimmon.py
index 0ffb999145be22..c17ba35dd340eb 100644
--- a/tests/models/persimmon/test_modeling_persimmon.py
+++ b/tests/models/persimmon/test_modeling_persimmon.py
@@ -104,7 +104,7 @@ def prepare_config_and_inputs(self):
 
         input_mask = None
         if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
+            input_mask = torch.tril(torch.ones(self.batch_size, self.seq_length))
 
         token_type_ids = None
         if self.use_token_type_ids:

From 8b1f3748704d59364c5be5f53fa9aff60ae8a461 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Sun, 5 Nov 2023 16:22:11 +0000
Subject: [PATCH 77/87] updates docs index

---
 docs/source/en/index.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/source/en/index.md b/docs/source/en/index.md
index 036c0bb4db253b..9b660b775981aa 100644
--- a/docs/source/en/index.md
+++ b/docs/source/en/index.md
@@ -93,7 +93,7 @@ Flax), PyTorch, and/or TensorFlow.
 |                          [CLIP](model_doc/clip)                          |       ✅        |         ✅         |      ✅      |
 |                       [CLIPSeg](model_doc/clipseg)                       |       ✅        |         ❌         |      ❌      |
 |                       [CodeGen](model_doc/codegen)                       |       ✅        |         ❌         |      ❌      |
-|                    [CodeLlama](model_doc/code_llama)                     |       ✅        |         ❌         |      ❌      |
+|                    [CodeLlama](model_doc/code_llama)                     |       ✅        |         ❌         |      ✅      |
 |              [Conditional DETR](model_doc/conditional_detr)              |       ✅        |         ❌         |      ❌      |
 |                      [ConvBERT](model_doc/convbert)                      |       ✅        |         ✅         |      ❌      |
 |                      [ConvNeXT](model_doc/convnext)                      |       ✅        |         ✅         |      ❌      |
@@ -166,8 +166,8 @@ Flax), PyTorch, and/or TensorFlow.
 |                           [LED](model_doc/led)                           |       ✅        |         ✅         |      ❌      |
 |                         [LeViT](model_doc/levit)                         |       ✅        |         ❌         |      ❌      |
 |                          [LiLT](model_doc/lilt)                          |       ✅        |         ❌         |      ❌      |
-|                         [LLaMA](model_doc/llama)                         |       ✅        |         ❌         |      ❌      |
-|                        [Llama2](model_doc/llama2)                        |       ✅        |         ❌         |      ❌      |
+|                         [LLaMA](model_doc/llama)                         |       ✅        |         ❌         |      ✅      |
+|                        [Llama2](model_doc/llama2)                        |       ✅        |         ❌         |      ✅      |
 |                    [Longformer](model_doc/longformer)                    |       ✅        |         ✅         |      ❌      |
 |                        [LongT5](model_doc/longt5)                        |       ✅        |         ❌         |      ✅      |
 |                          [LUKE](model_doc/luke)                          |       ✅        |         ❌         |      ❌      |

From d9c7af6d37d4a183c67646609d6ae2797ebe1167 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Sun, 5 Nov 2023 16:22:20 +0000
Subject: [PATCH 78/87] removes Copied from in tests

it was preventing `make fix-copies` from succeeding
---
 tests/models/llama/test_modeling_flax_llama.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tests/models/llama/test_modeling_flax_llama.py b/tests/models/llama/test_modeling_flax_llama.py
index 0f429a623e801d..a81398786b43b6 100644
--- a/tests/models/llama/test_modeling_flax_llama.py
+++ b/tests/models/llama/test_modeling_flax_llama.py
@@ -103,14 +103,12 @@ def prepare_config_and_inputs(self):
 
         return (config, input_ids, input_mask)
 
-    # Copied from tests.models.gpt_neo.FlaxGPTNeoModelTester.prepare_config_and_inputs_for_common
     def prepare_config_and_inputs_for_common(self):
         config_and_inputs = self.prepare_config_and_inputs()
         config, input_ids, attention_mask = config_and_inputs
         inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask}
         return config, inputs_dict
 
-    # Copied from tests.models.gpt_neo.FlaxGPTNeoModelTester.check_use_cache_forward
     def check_use_cache_forward(self, model_class_name, config, input_ids, attention_mask):
         max_decoder_length = 20
         model = model_class_name(config)
@@ -141,7 +139,6 @@ def check_use_cache_forward(self, model_class_name, config, input_ids, attention
         diff = np.max(np.abs((outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5])))
         self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
 
-    # Copied from tests.models.gpt_neo.FlaxGPTNeoModelTester.check_use_cache_forward_with_attn_mask
     def check_use_cache_forward_with_attn_mask(self, model_class_name, config, input_ids, attention_mask):
         max_decoder_length = 20
         model = model_class_name(config)

From 30fe8e1b94d15fcabfdb7660ce27f1d347e37028 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Sun, 5 Nov 2023 16:43:38 +0000
Subject: [PATCH 79/87] quality and style

---
 tests/models/mistral/test_modeling_mistral.py     | 2 +-
 tests/models/persimmon/test_modeling_persimmon.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/models/mistral/test_modeling_mistral.py b/tests/models/mistral/test_modeling_mistral.py
index 39f4b7b371ca9d..3b4d0350837f99 100644
--- a/tests/models/mistral/test_modeling_mistral.py
+++ b/tests/models/mistral/test_modeling_mistral.py
@@ -33,7 +33,7 @@
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
+from ...test_modeling_common import ModelTesterMixin, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
diff --git a/tests/models/persimmon/test_modeling_persimmon.py b/tests/models/persimmon/test_modeling_persimmon.py
index c17ba35dd340eb..e25a31b3ff4f0d 100644
--- a/tests/models/persimmon/test_modeling_persimmon.py
+++ b/tests/models/persimmon/test_modeling_persimmon.py
@@ -32,7 +32,7 @@
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
+from ...test_modeling_common import ModelTesterMixin, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 

From d21d30692e7095a60e87533cc8291ee585e6c2ca Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Thu, 9 Nov 2023 17:36:12 +0000
Subject: [PATCH 80/87] ignores FlaxLlama input docstring

---
 utils/check_docstrings.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/utils/check_docstrings.py b/utils/check_docstrings.py
index cddd04ac6516d1..7722f507734d61 100644
--- a/utils/check_docstrings.py
+++ b/utils/check_docstrings.py
@@ -233,6 +233,8 @@
     "FlaxGPTJModel",
     "FlaxGPTNeoForCausalLM",
     "FlaxGPTNeoModel",
+    "FlaxLlamaForCausalLM",
+    "FlaxLlamaModel",
     "FlaxMBartForConditionalGeneration",
     "FlaxMBartForQuestionAnswering",
     "FlaxMBartForSequenceClassification",

From 339065191f627a612c701c332091a14467600572 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Fri, 24 Nov 2023 16:40:55 +0000
Subject: [PATCH 81/87] adds revision to `_CHECKPOINT_FOR_DOC`

---
 src/transformers/models/llama/modeling_flax_llama.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index c7d37ea06aad21..ea7d40e049cd69 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -632,7 +632,7 @@ class FlaxLlamaModel(FlaxLlamaPreTrainedModel):
     module_class = FlaxLlamaModule
 
 
-append_call_sample_docstring(FlaxLlamaModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutput, _CONFIG_FOR_DOC)
+append_call_sample_docstring(FlaxLlamaModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutput, _CONFIG_FOR_DOC, revision="refs/pr/4")
 
 
 class FlaxLlamaForCausalLMModule(nn.Module):
@@ -716,4 +716,4 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
         return model_kwargs
 
 
-append_call_sample_docstring(FlaxLlamaForCausalLM, _CHECKPOINT_FOR_DOC, FlaxCausalLMOutput, _CONFIG_FOR_DOC)
+append_call_sample_docstring(FlaxLlamaForCausalLM, _CHECKPOINT_FOR_DOC, FlaxCausalLMOutput, _CONFIG_FOR_DOC, revision="refs/pr/4")

From f303188543a2ef2a6221a1ef878f2c811a42c8ff Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Fri, 24 Nov 2023 17:28:06 +0000
Subject: [PATCH 82/87] repo consistency and quality

---
 src/transformers/models/llama/modeling_flax_llama.py | 8 ++++++--
 tests/models/phi/test_modeling_phi.py                | 2 +-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index ea7d40e049cd69..cdd97172fdd8c6 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -632,7 +632,9 @@ class FlaxLlamaModel(FlaxLlamaPreTrainedModel):
     module_class = FlaxLlamaModule
 
 
-append_call_sample_docstring(FlaxLlamaModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutput, _CONFIG_FOR_DOC, revision="refs/pr/4")
+append_call_sample_docstring(
+    FlaxLlamaModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutput, _CONFIG_FOR_DOC, revision="refs/pr/4"
+)
 
 
 class FlaxLlamaForCausalLMModule(nn.Module):
@@ -716,4 +718,6 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
         return model_kwargs
 
 
-append_call_sample_docstring(FlaxLlamaForCausalLM, _CHECKPOINT_FOR_DOC, FlaxCausalLMOutput, _CONFIG_FOR_DOC, revision="refs/pr/4")
+append_call_sample_docstring(
+    FlaxLlamaForCausalLM, _CHECKPOINT_FOR_DOC, FlaxCausalLMOutput, _CONFIG_FOR_DOC, revision="refs/pr/4"
+)
diff --git a/tests/models/phi/test_modeling_phi.py b/tests/models/phi/test_modeling_phi.py
index 93c5ca85e9a4a3..bcc8ad293127bc 100644
--- a/tests/models/phi/test_modeling_phi.py
+++ b/tests/models/phi/test_modeling_phi.py
@@ -95,7 +95,7 @@ def prepare_config_and_inputs(self):
 
         input_mask = None
         if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
+            input_mask = torch.tril(torch.ones(self.batch_size, self.seq_length))
 
         token_type_ids = None
         if self.use_token_type_ids:

From aea3e03f5489302743066ec4b42064909a70d690 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Fri, 24 Nov 2023 17:36:10 +0000
Subject: [PATCH 83/87] removes unused import

---
 tests/models/phi/test_modeling_phi.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/phi/test_modeling_phi.py b/tests/models/phi/test_modeling_phi.py
index bcc8ad293127bc..21d81dd0f4e25b 100644
--- a/tests/models/phi/test_modeling_phi.py
+++ b/tests/models/phi/test_modeling_phi.py
@@ -23,7 +23,7 @@
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
+from ...test_modeling_common import ModelTesterMixin, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 

From 842b5501363a709f512ab725b64f414d3905def2 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Fri, 24 Nov 2023 22:31:24 +0000
Subject: [PATCH 84/87] removes copied from from Phi test

now diverges from llama tests following FlaxLlama changes
---
 tests/models/phi/test_modeling_phi.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/models/phi/test_modeling_phi.py b/tests/models/phi/test_modeling_phi.py
index 21d81dd0f4e25b..cf5224438ab290 100644
--- a/tests/models/phi/test_modeling_phi.py
+++ b/tests/models/phi/test_modeling_phi.py
@@ -23,7 +23,7 @@
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, ids_tensor
+from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -38,7 +38,6 @@
     )
 
 
-# Copied from tests.models.llama.test_modeling_llama.LlamaModelTester with Llama->Phi
 class PhiModelTester:
     def __init__(
         self,
@@ -95,7 +94,7 @@ def prepare_config_and_inputs(self):
 
         input_mask = None
         if self.use_input_mask:
-            input_mask = torch.tril(torch.ones(self.batch_size, self.seq_length))
+            input_mask = random_attention_mask([self.batch_size, self.seq_length])
 
         token_type_ids = None
         if self.use_token_type_ids:

From 193052f8695791fea2302aea64e82c94260b7642 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Mon, 4 Dec 2023 13:48:54 +0000
Subject: [PATCH 85/87] adds `_REAL_CHECKPOINT_FOR_DOC`

---
 src/transformers/modeling_flax_utils.py         |  5 ++++-
 .../models/llama/modeling_flax_llama.py         | 17 ++++++++++++++---
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/src/transformers/modeling_flax_utils.py b/src/transformers/modeling_flax_utils.py
index af0ac25a668098..eb14216c5cd994 100644
--- a/src/transformers/modeling_flax_utils.py
+++ b/src/transformers/modeling_flax_utils.py
@@ -1267,7 +1267,9 @@ def overwrite_call_docstring(model_class, docstring):
     model_class.__call__ = add_start_docstrings_to_model_forward(docstring)(model_class.__call__)
 
 
-def append_call_sample_docstring(model_class, checkpoint, output_type, config_class, mask=None, revision=None):
+def append_call_sample_docstring(
+    model_class, checkpoint, output_type, config_class, mask=None, revision=None, real_checkpoint=None
+):
     model_class.__call__ = copy_func(model_class.__call__)
     model_class.__call__ = add_code_sample_docstrings(
         checkpoint=checkpoint,
@@ -1275,6 +1277,7 @@ def append_call_sample_docstring(model_class, checkpoint, output_type, config_cl
         config_class=config_class,
         model_cls=model_class.__name__,
         revision=revision,
+        real_checkpoint=real_checkpoint,
     )(model_class.__call__)
 
 
diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index cdd97172fdd8c6..4cf5461a684e89 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -40,7 +40,8 @@
 logger = logging.get_logger(__name__)
 
 _CONFIG_FOR_DOC = "LlamaConfig"
-_CHECKPOINT_FOR_DOC = "openlm-research/open_llama_3b_v2"
+_CHECKPOINT_FOR_DOC = "afmck/testing-llama-tiny"
+_REAL_CHECKPOINT_FOR_DOC = "openlm-research/open_llama_3b_v2"
 
 LLAMA_START_DOCSTRING = r"""
 
@@ -633,7 +634,12 @@ class FlaxLlamaModel(FlaxLlamaPreTrainedModel):
 
 
 append_call_sample_docstring(
-    FlaxLlamaModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutput, _CONFIG_FOR_DOC, revision="refs/pr/4"
+    FlaxLlamaModel,
+    _CHECKPOINT_FOR_DOC,
+    FlaxBaseModelOutput,
+    _CONFIG_FOR_DOC,
+    revision="refs/pr/4",
+    real_checkpoint=_REAL_CHECKPOINT_FOR_DOC,
 )
 
 
@@ -719,5 +725,10 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
 
 
 append_call_sample_docstring(
-    FlaxLlamaForCausalLM, _CHECKPOINT_FOR_DOC, FlaxCausalLMOutput, _CONFIG_FOR_DOC, revision="refs/pr/4"
+    FlaxLlamaForCausalLM,
+    _CHECKPOINT_FOR_DOC,
+    FlaxCausalLMOutput,
+    _CONFIG_FOR_DOC,
+    revision="refs/pr/4",
+    real_checkpoint=_REAL_CHECKPOINT_FOR_DOC,
 )

From b33a54da745701124597cf931930e67198407427 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Tue, 5 Dec 2023 06:48:02 +0000
Subject: [PATCH 86/87] removes refs from pr tests

---
 src/transformers/models/llama/modeling_flax_llama.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
index 4cf5461a684e89..9430eba41c6548 100644
--- a/src/transformers/models/llama/modeling_flax_llama.py
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -638,7 +638,6 @@ class FlaxLlamaModel(FlaxLlamaPreTrainedModel):
     _CHECKPOINT_FOR_DOC,
     FlaxBaseModelOutput,
     _CONFIG_FOR_DOC,
-    revision="refs/pr/4",
     real_checkpoint=_REAL_CHECKPOINT_FOR_DOC,
 )
 
@@ -729,6 +728,5 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs):
     _CHECKPOINT_FOR_DOC,
     FlaxCausalLMOutput,
     _CONFIG_FOR_DOC,
-    revision="refs/pr/4",
     real_checkpoint=_REAL_CHECKPOINT_FOR_DOC,
 )

From b9ed34fb0fcf5d61be0fc9d2d02072bf63dae282 Mon Sep 17 00:00:00 2001
From: Alex McKinney <alex.f.mckinney@gmail.com>
Date: Tue, 5 Dec 2023 07:07:53 +0000
Subject: [PATCH 87/87] reformat to make ruff happy

---
 src/transformers/models/bloom/modeling_bloom.py       | 2 +-
 src/transformers/models/fuyu/image_processing_fuyu.py | 2 +-
 src/transformers/models/mpt/modeling_mpt.py           | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/transformers/models/bloom/modeling_bloom.py b/src/transformers/models/bloom/modeling_bloom.py
index 3064d5bae69673..ad1e87ec2c26e2 100644
--- a/src/transformers/models/bloom/modeling_bloom.py
+++ b/src/transformers/models/bloom/modeling_bloom.py
@@ -489,7 +489,7 @@ def _convert_to_standard_cache(
 
     @staticmethod
     def _convert_to_bloom_cache(
-        past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]]
+        past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]],
     ) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]:
         """
         Converts the cache to the format expected by Bloom, i.e. to tuple(tuple([batch_size * num_heads, ...]))
diff --git a/src/transformers/models/fuyu/image_processing_fuyu.py b/src/transformers/models/fuyu/image_processing_fuyu.py
index 0e415980c97fdd..ba5d52a58a80af 100644
--- a/src/transformers/models/fuyu/image_processing_fuyu.py
+++ b/src/transformers/models/fuyu/image_processing_fuyu.py
@@ -54,7 +54,7 @@
 
 
 def make_list_of_list_of_images(
-    images: Union[List[List[ImageInput]], List[ImageInput], ImageInput]
+    images: Union[List[List[ImageInput]], List[ImageInput], ImageInput],
 ) -> List[List[ImageInput]]:
     if is_valid_image(images):
         return [[images]]
diff --git a/src/transformers/models/mpt/modeling_mpt.py b/src/transformers/models/mpt/modeling_mpt.py
index e2d12f0e777ee7..ae8e7e9031129f 100644
--- a/src/transformers/models/mpt/modeling_mpt.py
+++ b/src/transformers/models/mpt/modeling_mpt.py
@@ -265,7 +265,7 @@ def _init_weights(self, module: nn.Module):
 
     @staticmethod
     def _convert_to_mpt_cache(
-        past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]]
+        past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]],
     ) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]:
         """
         Converts the cache to the format expected by Mpt, i.e. to tuple(tuple([batch_size * num_heads, ...]))