From 0332c3471a280c2f79e5af7ff8e8763acebfb551 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Fri, 3 Mar 2023 00:49:30 +0000 Subject: [PATCH 01/26] LLAMA --- torchbenchmark/models/llama | 1 + 1 file changed, 1 insertion(+) create mode 160000 torchbenchmark/models/llama diff --git a/torchbenchmark/models/llama b/torchbenchmark/models/llama new file mode 160000 index 0000000000..76066b1b5c --- /dev/null +++ b/torchbenchmark/models/llama @@ -0,0 +1 @@ +Subproject commit 76066b1b5cf467ce750f51af15cd34de442185e7 From 2242fc7482d07ec20999de678f15757e2165b37f Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Fri, 3 Mar 2023 00:54:19 +0000 Subject: [PATCH 02/26] Add LLAMA --- torchbenchmark/models/llama | 1 - torchbenchmark/models/llama/__init__.py | 43 ++++ torchbenchmark/models/llama/llama/__init__.py | 6 + .../models/llama/llama/generation.py | 77 ++++++ torchbenchmark/models/llama/llama/model.py | 240 ++++++++++++++++++ .../models/llama/llama/tokenizer.py | 40 +++ torchbenchmark/models/llama/metadata.yaml | 8 + torchbenchmark/models/llama/origin | 1 + torchbenchmark/models/llama/requirements.txt | 3 + 9 files changed, 418 insertions(+), 1 deletion(-) delete mode 160000 torchbenchmark/models/llama create mode 100644 torchbenchmark/models/llama/__init__.py create mode 100644 torchbenchmark/models/llama/llama/__init__.py create mode 100644 torchbenchmark/models/llama/llama/generation.py create mode 100644 torchbenchmark/models/llama/llama/model.py create mode 100644 torchbenchmark/models/llama/llama/tokenizer.py create mode 100644 torchbenchmark/models/llama/metadata.yaml create mode 100644 torchbenchmark/models/llama/origin create mode 100644 torchbenchmark/models/llama/requirements.txt diff --git a/torchbenchmark/models/llama b/torchbenchmark/models/llama deleted file mode 160000 index 76066b1b5c..0000000000 --- a/torchbenchmark/models/llama +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 76066b1b5cf467ce750f51af15cd34de442185e7 diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py new file mode 100644 index 0000000000..4c7c4bff4b --- /dev/null +++ b/torchbenchmark/models/llama/__init__.py @@ -0,0 +1,43 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# This software may be used and distributed according to the terms of the GNU General Public License version 3. 
+ + + +from ...util.model import BenchmarkModel +from typing import Tuple +import os +import sys +import torch +import fire +import time +import json + +from pathlib import Path + +from fairscale.nn.model_parallel.initialize import initialize_model_parallel + +from llama import ModelArgs, Transformer, Tokenizer, LLaMA +import torch + +torch.backends.cudnn.deterministic = False +torch.backends.cudnn.benchmark = True + +class Model(BenchmarkModel): + def __init__(self, temperature: float = 0.8, top_p: float = 0.95): + self.model_args = ModelArgs() + self.generator = Transformer(self.model_args) + self.temperature = temperature + self.top_p = top_p + + def inference(self, prompts : str): + prompts = ["The capital of Germany is the city of", "Here is my sonnet in the style of Shakespeare about an artificial intelligence:"] + results = self.generator.generate(prompts, max_gen_len=256, temperature=self.temperature, top_p=self.top_p) + + for result in results: + print(result) + print("\n==================================\n") + + + + + diff --git a/torchbenchmark/models/llama/llama/__init__.py b/torchbenchmark/models/llama/llama/__init__.py new file mode 100644 index 0000000000..009c144d21 --- /dev/null +++ b/torchbenchmark/models/llama/llama/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# This software may be used and distributed according to the terms of the GNU General Public License version 3. + +from .generation import LLaMA +from .model import ModelArgs, Transformer +from .tokenizer import Tokenizer \ No newline at end of file diff --git a/torchbenchmark/models/llama/llama/generation.py b/torchbenchmark/models/llama/llama/generation.py new file mode 100644 index 0000000000..4a23735d2f --- /dev/null +++ b/torchbenchmark/models/llama/llama/generation.py @@ -0,0 +1,77 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# This software may be used and distributed according to the terms of the GNU General Public License version 3. 
+ +from typing import List + +import torch + +from llama.tokenizer import Tokenizer +from llama.model import Transformer + + +class LLaMA: + def __init__(self, model: Transformer, tokenizer: Tokenizer): + self.model = model + self.tokenizer = tokenizer + + def generate( + self, + prompts: List[str], + max_gen_len: int, + temperature: float = 0.8, + top_p: float = 0.95, + ) -> List[str]: + bsz = len(prompts) + params = self.model.params + assert bsz <= params.max_batch_size, (bsz, params.max_batch_size) + + prompt_tokens = [self.tokenizer.encode(x, bos=True, eos=False) for x in prompts] + + min_prompt_size = min([len(t) for t in prompt_tokens]) + max_prompt_size = max([len(t) for t in prompt_tokens]) + + total_len = min(params.max_seq_len, max_gen_len + max_prompt_size) + + tokens = torch.full((bsz, total_len), self.tokenizer.pad_id).cuda().long() + for k, t in enumerate(prompt_tokens): + tokens[k, : len(t)] = torch.tensor(t).long() + input_text_mask = tokens != self.tokenizer.pad_id + start_pos = min_prompt_size + prev_pos = 0 + for cur_pos in range(start_pos, total_len): + logits = self.model.forward(tokens[:, prev_pos:cur_pos], prev_pos) + if temperature > 0: + probs = torch.softmax(logits / temperature, dim=-1) + next_token = sample_top_p(probs, top_p) + else: + next_token = torch.argmax(logits, dim=-1) + next_token = next_token.reshape(-1) + # only replace token if prompt has already been generated + next_token = torch.where( + input_text_mask[:, cur_pos], tokens[:, cur_pos], next_token + ) + tokens[:, cur_pos] = next_token + prev_pos = cur_pos + + decoded = [] + for i, t in enumerate(tokens.tolist()): + # cut to max gen len + t = t[: len(prompt_tokens[i]) + max_gen_len] + # cut to eos tok if any + try: + t = t[: t.index(self.tokenizer.eos_id)] + except ValueError: + pass + decoded.append(self.tokenizer.decode(t)) + return decoded + + +def sample_top_p(probs, p): + probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True) + probs_sum = torch.cumsum(probs_sort, dim=-1) + mask = probs_sum - probs_sort > p + probs_sort[mask] = 0.0 + probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True)) + next_token = torch.multinomial(probs_sort, num_samples=1) + next_token = torch.gather(probs_idx, -1, next_token) + return next_token \ No newline at end of file diff --git a/torchbenchmark/models/llama/llama/model.py b/torchbenchmark/models/llama/llama/model.py new file mode 100644 index 0000000000..52e0c04379 --- /dev/null +++ b/torchbenchmark/models/llama/llama/model.py @@ -0,0 +1,240 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# This software may be used and distributed according to the terms of the GNU General Public License version 3. 
+ +from typing import Optional, Tuple +from dataclasses import dataclass +import math + +import torch +from torch import nn +import torch.nn.functional as F + +import fairscale.nn.model_parallel.initialize as fs_init +from fairscale.nn.model_parallel.layers import ( + ParallelEmbedding, + RowParallelLinear, + ColumnParallelLinear, +) + + +@dataclass +class ModelArgs: + dim: int = 512 + n_layers: int = 8 + n_heads: int = 8 + vocab_size: int = -1 # defined later by tokenizer + multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2 + norm_eps: float = 1e-5 + + max_batch_size: int = 32 + max_seq_len: int = 1024 + + +class RMSNorm(torch.nn.Module): + def __init__(self, dim: int, eps: float = 1e-6): + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.ones(dim)) + + def _norm(self, x): + return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) + + def forward(self, x): + output = self._norm(x.float()).type_as(x) + return output * self.weight + + +def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0): + freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) + t = torch.arange(end, device=freqs.device) # type: ignore + freqs = torch.outer(t, freqs).float() # type: ignore + freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64 + return freqs_cis + + +def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor): + ndim = x.ndim + assert 0 <= 1 < ndim + assert freqs_cis.shape == (x.shape[1], x.shape[-1]) + shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)] + return freqs_cis.view(*shape) + + +def apply_rotary_emb( + xq: torch.Tensor, + xk: torch.Tensor, + freqs_cis: torch.Tensor, +) -> Tuple[torch.Tensor, torch.Tensor]: + xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) + xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) + freqs_cis = reshape_for_broadcast(freqs_cis, xq_) + xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3) + xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3) + return xq_out.type_as(xq), xk_out.type_as(xk) + + +class Attention(nn.Module): + def __init__(self, args: ModelArgs): + super().__init__() + + self.n_local_heads = args.n_heads // fs_init.get_model_parallel_world_size() + self.head_dim = args.dim // args.n_heads + + self.wq = ColumnParallelLinear( + args.dim, + args.n_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.wk = ColumnParallelLinear( + args.dim, + args.n_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.wv = ColumnParallelLinear( + args.dim, + args.n_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.wo = RowParallelLinear( + args.n_heads * self.head_dim, + args.dim, + bias=False, + input_is_parallel=True, + init_method=lambda x: x, + ) + + self.cache_k = torch.zeros( + (args.max_batch_size, args.max_seq_len, self.n_local_heads, self.head_dim) + ).cuda() + self.cache_v = torch.zeros( + (args.max_batch_size, args.max_seq_len, self.n_local_heads, self.head_dim) + ).cuda() + + def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask: Optional[torch.Tensor]): + bsz, seqlen, _ = x.shape + xq, xk, xv = self.wq(x), self.wk(x), self.wv(x) + + xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim) + xk = xk.view(bsz, seqlen, self.n_local_heads, self.head_dim) + xv = xv.view(bsz, seqlen, self.n_local_heads, 
self.head_dim) + + xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis) + + self.cache_k = self.cache_k.to(xq) + self.cache_v = self.cache_v.to(xq) + + self.cache_k[:bsz, start_pos : start_pos + seqlen] = xk + self.cache_v[:bsz, start_pos : start_pos + seqlen] = xv + + keys = self.cache_k[:bsz, : start_pos + seqlen] + values = self.cache_v[:bsz, : start_pos + seqlen] + + xq = xq.transpose(1, 2) + keys = keys.transpose(1, 2) + values = values.transpose(1, 2) + scores = torch.matmul(xq, keys.transpose(2, 3)) / math.sqrt(self.head_dim) + if mask is not None: + scores = scores + mask # (bs, n_local_heads, slen, cache_len + slen) + scores = F.softmax(scores.float(), dim=-1).type_as(xq) + output = torch.matmul(scores, values) # (bs, n_local_heads, slen, head_dim) + output = output.transpose( + 1, 2 + ).contiguous().view(bsz, seqlen, -1) + + return self.wo(output) + + +class FeedForward(nn.Module): + def __init__( + self, + dim: int, + hidden_dim: int, + multiple_of: int, + ): + super().__init__() + hidden_dim = int(2 * hidden_dim / 3) + hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + + self.w1 = ColumnParallelLinear( + dim, hidden_dim, bias=False, gather_output=False, init_method=lambda x: x + ) + self.w2 = RowParallelLinear( + hidden_dim, dim, bias=False, input_is_parallel=True, init_method=lambda x: x + ) + self.w3 = ColumnParallelLinear( + dim, hidden_dim, bias=False, gather_output=False, init_method=lambda x: x + ) + + def forward(self, x): + return self.w2(F.silu(self.w1(x)) * self.w3(x)) + + +class TransformerBlock(nn.Module): + def __init__(self, layer_id: int, args: ModelArgs): + super().__init__() + self.n_heads = args.n_heads + self.dim = args.dim + self.head_dim = args.dim // args.n_heads + self.attention = Attention(args) + self.feed_forward = FeedForward( + dim=args.dim, hidden_dim=4 * args.dim, multiple_of=args.multiple_of + ) + self.layer_id = layer_id + self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps) + self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps) + + def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask: Optional[torch.Tensor]): + h = x + self.attention.forward(self.attention_norm(x), start_pos, freqs_cis, mask) + out = h + self.feed_forward.forward(self.ffn_norm(h)) + return out + + +class Transformer(nn.Module): + def __init__(self, params: ModelArgs): + super().__init__() + self.params = params + self.vocab_size = params.vocab_size + self.n_layers = params.n_layers + + self.tok_embeddings = ParallelEmbedding( + params.vocab_size, params.dim, init_method=lambda x: x + ) + + self.layers = torch.nn.ModuleList() + for layer_id in range(params.n_layers): + self.layers.append(TransformerBlock(layer_id, params)) + + self.norm = RMSNorm(params.dim, eps=params.norm_eps) + self.output = ColumnParallelLinear( + params.dim, params.vocab_size, bias=False, init_method=lambda x: x + ) + + self.freqs_cis = precompute_freqs_cis( + self.params.dim // self.params.n_heads, self.params.max_seq_len * 2 + ) + + # TODO: Not sure if we should keep or remove inference mode + # But OP is using it + # @torch.inference_mode() + def forward(self, tokens: torch.Tensor, start_pos: int): + _bsz, seqlen = tokens.shape + h = self.tok_embeddings(tokens) + self.freqs_cis = self.freqs_cis.to(h.device) + freqs_cis = self.freqs_cis[start_pos : start_pos + seqlen] + + mask = None + if seqlen > 1: + mask = torch.full((1, 1, seqlen, seqlen), float("-inf"), device=tokens.device) + mask = torch.triu(mask, diagonal=start_pos + 1).type_as(h) + + 
for layer in self.layers: + h = layer(h, start_pos, freqs_cis, mask) + h = self.norm(h) + output = self.output(h[:, -1, :]) # only compute last logits + return output.float() diff --git a/torchbenchmark/models/llama/llama/tokenizer.py b/torchbenchmark/models/llama/llama/tokenizer.py new file mode 100644 index 0000000000..e4315856ee --- /dev/null +++ b/torchbenchmark/models/llama/llama/tokenizer.py @@ -0,0 +1,40 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# This software may be used and distributed according to the terms of the GNU General Public License version 3. + +from sentencepiece import SentencePieceProcessor +from logging import getLogger +from typing import List +import os + + +logger = getLogger() + + +class Tokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + logger.info(f"Reloaded SentencePiece model from {model_path}") + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.pad_id() + logger.info( + f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}" + ) + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + def encode(self, s: str, bos: bool, eos: bool) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + return self.sp_model.decode(t) diff --git a/torchbenchmark/models/llama/metadata.yaml b/torchbenchmark/models/llama/metadata.yaml new file mode 100644 index 0000000000..cc072bb909 --- /dev/null +++ b/torchbenchmark/models/llama/metadata.yaml @@ -0,0 +1,8 @@ +devices: + NVIDIA A100-SXM4-40GB: + eval_batch_size: 1024 +eval_benchmark: false +eval_deterministic: false +eval_nograd: true +train_benchmark: false +train_deterministic: false \ No newline at end of file diff --git a/torchbenchmark/models/llama/origin b/torchbenchmark/models/llama/origin new file mode 100644 index 0000000000..3446020c5c --- /dev/null +++ b/torchbenchmark/models/llama/origin @@ -0,0 +1 @@ +origin https://github.com/facebookresearch/llama \ No newline at end of file diff --git a/torchbenchmark/models/llama/requirements.txt b/torchbenchmark/models/llama/requirements.txt new file mode 100644 index 0000000000..bfda692406 --- /dev/null +++ b/torchbenchmark/models/llama/requirements.txt @@ -0,0 +1,3 @@ +fairscale +fire +sentencepiece \ No newline at end of file From c0468f43b497d85fd75aa963a293dfec91939df7 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Fri, 3 Mar 2023 01:34:17 +0000 Subject: [PATCH 03/26] add intall.py --- torchbenchmark/models/llama/install.py | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 torchbenchmark/models/llama/install.py diff --git a/torchbenchmark/models/llama/install.py b/torchbenchmark/models/llama/install.py new file mode 100644 index 0000000000..d6baff0571 --- /dev/null +++ b/torchbenchmark/models/llama/install.py @@ -0,0 +1,8 @@ +import subprocess +import sys + +def pip_install_requirements(): + subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', '-r', 'requirements.txt']) + +if __name__ == '__main__': + pip_install_requirements() \ No newline at end of file From 54b82af55cd5aa2d691fffa1c0388f23db2b99b4 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Mon, 6 Mar 2023 18:39:47 
+0000 Subject: [PATCH 04/26] Fixed some stuff --- torchbenchmark/models/llama/__init__.py | 34 +++++++------- torchbenchmark/models/llama/llama/model.py | 52 +++++++++------------- torchbenchmark/models/llama/test.py | 37 +++++++++++++++ 3 files changed, 75 insertions(+), 48 deletions(-) create mode 100644 torchbenchmark/models/llama/test.py diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index 4c7c4bff4b..7f1de280b5 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -4,18 +4,7 @@ from ...util.model import BenchmarkModel -from typing import Tuple -import os -import sys import torch -import fire -import time -import json - -from pathlib import Path - -from fairscale.nn.model_parallel.initialize import initialize_model_parallel - from llama import ModelArgs, Transformer, Tokenizer, LLaMA import torch @@ -28,14 +17,25 @@ def __init__(self, temperature: float = 0.8, top_p: float = 0.95): self.generator = Transformer(self.model_args) self.temperature = temperature self.top_p = top_p + + + + # def inference(self, prompts : str): + # prompts = ["The capital of Germany is the city of", "Here is my sonnet in the style of Shakespeare about an artificial intelligence:"] + # results = self.generator.generate(prompts, max_gen_len=256, temperature=self.temperature, top_p=self.top_p) + + # for result in results: + # print(result) + # print("\n==================================\n") + + def get_module(self): + return self.generator, - def inference(self, prompts : str): - prompts = ["The capital of Germany is the city of", "Here is my sonnet in the style of Shakespeare about an artificial intelligence:"] - results = self.generator.generate(prompts, max_gen_len=256, temperature=self.temperature, top_p=self.top_p) + def train(self): + return NotImplementedError - for result in results: - print(result) - print("\n==================================\n") + def eval(self): + return NotImplementedError diff --git a/torchbenchmark/models/llama/llama/model.py b/torchbenchmark/models/llama/llama/model.py index 52e0c04379..b5b62b8c68 100644 --- a/torchbenchmark/models/llama/llama/model.py +++ b/torchbenchmark/models/llama/llama/model.py @@ -9,13 +9,6 @@ from torch import nn import torch.nn.functional as F -import fairscale.nn.model_parallel.initialize as fs_init -from fairscale.nn.model_parallel.layers import ( - ParallelEmbedding, - RowParallelLinear, - ColumnParallelLinear, -) - @dataclass class ModelArgs: @@ -77,36 +70,32 @@ class Attention(nn.Module): def __init__(self, args: ModelArgs): super().__init__() - self.n_local_heads = args.n_heads // fs_init.get_model_parallel_world_size() + self.n_local_heads = args.n_heads # Basically we just assume world size of 1 // fs_init.get_model_parallel_world_size() self.head_dim = args.dim // args.n_heads - self.wq = ColumnParallelLinear( + self.wq = nn.Linear( args.dim, args.n_heads * self.head_dim, bias=False, - gather_output=False, - init_method=lambda x: x, + ) - self.wk = ColumnParallelLinear( + self.wk = nn.Linear( args.dim, args.n_heads * self.head_dim, bias=False, - gather_output=False, - init_method=lambda x: x, + ) - self.wv = ColumnParallelLinear( + self.wv = nn.Linear( args.dim, args.n_heads * self.head_dim, bias=False, - gather_output=False, - init_method=lambda x: x, + ) - self.wo = RowParallelLinear( + self.wo = nn.Linear( args.n_heads * self.head_dim, args.dim, bias=False, - input_is_parallel=True, - init_method=lambda x: x, + ) self.cache_k = torch.zeros( @@ 
-161,14 +150,14 @@ def __init__( hidden_dim = int(2 * hidden_dim / 3) hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) - self.w1 = ColumnParallelLinear( - dim, hidden_dim, bias=False, gather_output=False, init_method=lambda x: x + self.w1 = nn.Linear( + dim, hidden_dim, bias=False ) - self.w2 = RowParallelLinear( - hidden_dim, dim, bias=False, input_is_parallel=True, init_method=lambda x: x + self.w2 = nn.Linear( + hidden_dim, dim, bias=False ) - self.w3 = ColumnParallelLinear( - dim, hidden_dim, bias=False, gather_output=False, init_method=lambda x: x + self.w3 = nn.Linear( + dim, hidden_dim, bias=False ) def forward(self, x): @@ -202,17 +191,18 @@ def __init__(self, params: ModelArgs): self.vocab_size = params.vocab_size self.n_layers = params.n_layers - self.tok_embeddings = ParallelEmbedding( - params.vocab_size, params.dim, init_method=lambda x: x - ) + ## Commenting otherwise I get a model parallel group is not initialized error + # self.tok_embeddings = ParallelEmbedding( + # params.vocab_size, params.dim, init_method=lambda x: x + # ) self.layers = torch.nn.ModuleList() for layer_id in range(params.n_layers): self.layers.append(TransformerBlock(layer_id, params)) self.norm = RMSNorm(params.dim, eps=params.norm_eps) - self.output = ColumnParallelLinear( - params.dim, params.vocab_size, bias=False, init_method=lambda x: x + self.output = nn.Linear( + params.dim, params.vocab_size + 1, bias=False ) self.freqs_cis = precompute_freqs_cis( diff --git a/torchbenchmark/models/llama/test.py b/torchbenchmark/models/llama/test.py new file mode 100644 index 0000000000..f8f1e2e54b --- /dev/null +++ b/torchbenchmark/models/llama/test.py @@ -0,0 +1,37 @@ + +import torch +from llama import ModelArgs, Transformer, Tokenizer, LLaMA +import torch + +torch.backends.cudnn.deterministic = False +torch.backends.cudnn.benchmark = True + +class Model: + def __init__(self, temperature: float = 0.8, top_p: float = 0.95): + self.model_args = ModelArgs() + self.generator = Transformer(self.model_args) + self.temperature = temperature + self.top_p = top_p + + + + # def inference(self, prompts : str): + # prompts = ["The capital of Germany is the city of", "Here is my sonnet in the style of Shakespeare about an artificial intelligence:"] + # results = self.generator.generate(prompts, max_gen_len=256, temperature=self.temperature, top_p=self.top_p) + + # for result in results: + # print(result) + # print("\n==================================\n") + + def get_module(self): + return self.generator + + def train(self): + return NotImplementedError + + def eval(self): + return NotImplementedError + +if __name__ == "__main__": + model = Model() + model.get_module().generate((torch.randn(1,1,1,1))) \ No newline at end of file From b451954425196a7e0e9b28a7e356eb1dad262237 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Mon, 6 Mar 2023 19:44:19 +0000 Subject: [PATCH 05/26] test now runs --- torchbenchmark/models/llama/llama/model.py | 25 ++++++++++++---------- torchbenchmark/models/llama/test.py | 15 ++++--------- 2 files changed, 18 insertions(+), 22 deletions(-) diff --git a/torchbenchmark/models/llama/llama/model.py b/torchbenchmark/models/llama/llama/model.py index b5b62b8c68..7fa6d8a614 100644 --- a/torchbenchmark/models/llama/llama/model.py +++ b/torchbenchmark/models/llama/llama/model.py @@ -15,7 +15,7 @@ class ModelArgs: dim: int = 512 n_layers: int = 8 n_heads: int = 8 - vocab_size: int = -1 # defined later by tokenizer + vocab_size: int = 32 # TODO: This was -1 I changed it to make 
things work multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2 norm_eps: float = 1e-5 @@ -128,8 +128,10 @@ def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask keys = keys.transpose(1, 2) values = values.transpose(1, 2) scores = torch.matmul(xq, keys.transpose(2, 3)) / math.sqrt(self.head_dim) - if mask is not None: - scores = scores + mask # (bs, n_local_heads, slen, cache_len + slen) + + # TODO: RuntimeError: The size of tensor a (3) must match the size of tensor b (2) at non-singleton dimension 3 + # if mask is not None: + # scores = scores + mask # (bs, n_local_heads, slen, cache_len + slen) scores = F.softmax(scores.float(), dim=-1).type_as(xq) output = torch.matmul(scores, values) # (bs, n_local_heads, slen, head_dim) output = output.transpose( @@ -191,10 +193,10 @@ def __init__(self, params: ModelArgs): self.vocab_size = params.vocab_size self.n_layers = params.n_layers - ## Commenting otherwise I get a model parallel group is not initialized error - # self.tok_embeddings = ParallelEmbedding( - # params.vocab_size, params.dim, init_method=lambda x: x - # ) + self.tok_embeddings = nn.Embedding( + params.vocab_size + 1, params.dim + ) + self.layers = torch.nn.ModuleList() for layer_id in range(params.n_layers): @@ -209,16 +211,17 @@ def __init__(self, params: ModelArgs): self.params.dim // self.params.n_heads, self.params.max_seq_len * 2 ) - # TODO: Not sure if we should keep or remove inference mode - # But OP is using it - # @torch.inference_mode() def forward(self, tokens: torch.Tensor, start_pos: int): - _bsz, seqlen = tokens.shape + _ , seqlen = tokens.shape + + h = self.tok_embeddings(tokens) + self.freqs_cis = self.freqs_cis.to(h.device) freqs_cis = self.freqs_cis[start_pos : start_pos + seqlen] mask = None + if seqlen > 1: mask = torch.full((1, 1, seqlen, seqlen), float("-inf"), device=tokens.device) mask = torch.triu(mask, diagonal=start_pos + 1).type_as(h) diff --git a/torchbenchmark/models/llama/test.py b/torchbenchmark/models/llama/test.py index f8f1e2e54b..663c77cc86 100644 --- a/torchbenchmark/models/llama/test.py +++ b/torchbenchmark/models/llama/test.py @@ -12,16 +12,6 @@ def __init__(self, temperature: float = 0.8, top_p: float = 0.95): self.generator = Transformer(self.model_args) self.temperature = temperature self.top_p = top_p - - - - # def inference(self, prompts : str): - # prompts = ["The capital of Germany is the city of", "Here is my sonnet in the style of Shakespeare about an artificial intelligence:"] - # results = self.generator.generate(prompts, max_gen_len=256, temperature=self.temperature, top_p=self.top_p) - - # for result in results: - # print(result) - # print("\n==================================\n") def get_module(self): return self.generator @@ -34,4 +24,7 @@ def eval(self): if __name__ == "__main__": model = Model() - model.get_module().generate((torch.randn(1,1,1,1))) \ No newline at end of file + module = model.get_module() + input_tensor = torch.tensor([[1, 1], [1,1]], dtype=torch.int) + + module(input_tensor, 1) \ No newline at end of file From b95af599e6472a947c0d11ad672f52e582ed8eb6 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Mon, 6 Mar 2023 19:55:32 +0000 Subject: [PATCH 06/26] fix model --- torchbenchmark/models/llama/__init__.py | 31 ++++++++++++------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index 7f1de280b5..bf354dea62 100644 --- 
a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -12,30 +12,29 @@ torch.backends.cudnn.benchmark = True class Model(BenchmarkModel): - def __init__(self, temperature: float = 0.8, top_p: float = 0.95): + def __init__(self): self.model_args = ModelArgs() - self.generator = Transformer(self.model_args) - self.temperature = temperature - self.top_p = top_p + self.model = Transformer(self.model_args) + self.example_inputs = torch.tensor([[1, 1], [1,1]], dtype=torch.int) - - # def inference(self, prompts : str): - # prompts = ["The capital of Germany is the city of", "Here is my sonnet in the style of Shakespeare about an artificial intelligence:"] - # results = self.generator.generate(prompts, max_gen_len=256, temperature=self.temperature, top_p=self.top_p) - - # for result in results: - # print(result) - # print("\n==================================\n") - def get_module(self): - return self.generator, + return self.transformer, self.example_inputs def train(self): - return NotImplementedError + error_msg = """ + As of March 6, 2023 + The weights for this model are not publicly available and require a valid research reason to use + The publicly available github repo is inference only + https://github.com/facebookresearch/llama + """ + return NotImplementedError(error_msg) def eval(self): - return NotImplementedError + self.model.eval() + with torch.no_grad(): + out=self.model(self.example_inputs, 1) + return (out,) From 1a7833a48e92f539753bfee246128ccdd3f8d678 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Mon, 6 Mar 2023 19:59:58 +0000 Subject: [PATCH 07/26] upd --- torchbenchmark/models/llama/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index bf354dea62..2b75c77ae0 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -19,7 +19,7 @@ def __init__(self): def get_module(self): - return self.transformer, self.example_inputs + return self.model, self.example_inputs def train(self): error_msg = """ From 5ca98df8b04a54b0d3fbf8d8e4cd1b5494b247da Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Mon, 6 Mar 2023 20:05:34 +0000 Subject: [PATCH 08/26] updat docs --- torchbenchmark/models/ADDING_MODELS.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/torchbenchmark/models/ADDING_MODELS.md b/torchbenchmark/models/ADDING_MODELS.md index 8fdb646a38..44cdb04096 100644 --- a/torchbenchmark/models/ADDING_MODELS.md +++ b/torchbenchmark/models/ADDING_MODELS.md @@ -30,6 +30,16 @@ Some of the APIs are optional, and you can raise NotImplemented if a particular Take care to set the random seed like [here](https://github.com/pytorch/benchmark/blob/master/torchbenchmark/models/Background_Matting/__init__.py#L20), to ensure your model runs the same way each time it's benchmarked. + +#### A minimal new model addition +A bare miminum example you can follow is https://github.com/pytorch/benchmark/tree/main/torchbenchmark/models/phlippe_resnet + +The functions you specifically need to implement are +1. `__init__()` which is responsible for initalizing your `nn.Module` +2. `get_module()` which is responsible for returning the initialized `nn.Module` and an example input +3. `train()` which is a training loop, you can return a `NotImplementedError()` if your example is inference only +4. 
`eval()` which showcases a simple inference + ### Preparing install.py and dependencies Simply put, install.py should be a one stop shop to install all the dependencies for your model, __except torch, torchvision, torchtext__ which should be assumed to From aa0d4cb84cb9ed0fd79afcd3ca1c04fa10342ee9 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Mon, 6 Mar 2023 21:18:17 +0000 Subject: [PATCH 09/26] add stuff --- torchbenchmark/models/llama/__init__.py | 11 ++++++++--- torchbenchmark/models/llama/llama/model.py | 2 +- torchbenchmark/models/llama/test.py | 2 +- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index 2b75c77ae0..2008115617 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -4,6 +4,7 @@ from ...util.model import BenchmarkModel +from torchbenchmark.tasks import NLP import torch from llama import ModelArgs, Transformer, Tokenizer, LLaMA import torch @@ -12,10 +13,14 @@ torch.backends.cudnn.benchmark = True class Model(BenchmarkModel): - def __init__(self): - self.model_args = ModelArgs() + DEFAULT_EVAL_BSIZE = 128 + task = NLP.LANGUAGE_MODELING + + def __init__(self, test, device, jit=False, batch_size=DEFAULT_EVAL_BSIZE, extra_args=[]): + super().__init__(test=test, device=device, jit=jit, batch_size=batch_size, extra_args=extra_args) + self.model_args = ModelArgs(max_batch_size=batch_size, vocab_size=32, max_seq_len=1032) self.model = Transformer(self.model_args) - self.example_inputs = torch.tensor([[1, 1], [1,1]], dtype=torch.int) + self.example_inputs = torch.tensor([[batch_size, 1, 1], [batch_size, 1,1]], dtype=torch.int) def get_module(self): diff --git a/torchbenchmark/models/llama/llama/model.py b/torchbenchmark/models/llama/llama/model.py index 7fa6d8a614..f91c875295 100644 --- a/torchbenchmark/models/llama/llama/model.py +++ b/torchbenchmark/models/llama/llama/model.py @@ -15,7 +15,7 @@ class ModelArgs: dim: int = 512 n_layers: int = 8 n_heads: int = 8 - vocab_size: int = 32 # TODO: This was -1 I changed it to make things work + vocab_size: int = -1 multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2 norm_eps: float = 1e-5 diff --git a/torchbenchmark/models/llama/test.py b/torchbenchmark/models/llama/test.py index 663c77cc86..42f0dace59 100644 --- a/torchbenchmark/models/llama/test.py +++ b/torchbenchmark/models/llama/test.py @@ -25,6 +25,6 @@ def eval(self): if __name__ == "__main__": model = Model() module = model.get_module() - input_tensor = torch.tensor([[1, 1], [1,1]], dtype=torch.int) + input_tensor = torch.tensor([[5, 1, 1], [1,1]], dtype=torch.int) module(input_tensor, 1) \ No newline at end of file From 1a5f4d2c0c08c5d7959835cf1901159755b0f677 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Mon, 6 Mar 2023 21:45:25 +0000 Subject: [PATCH 10/26] minor fix --- torchbenchmark/models/llama/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index 2008115617..f6193c6708 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -6,13 +6,13 @@ from ...util.model import BenchmarkModel from torchbenchmark.tasks import NLP import torch -from llama import ModelArgs, Transformer, Tokenizer, LLaMA +from llama import ModelArgs, Transformer import torch torch.backends.cudnn.deterministic = False torch.backends.cudnn.benchmark = True -class 
Model(BenchmarkModel): +class LLAMA(BenchmarkModel): DEFAULT_EVAL_BSIZE = 128 task = NLP.LANGUAGE_MODELING From 7a7627fbb31a7fae1e884e49bd02933f0fe500d3 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Mon, 6 Mar 2023 22:19:30 +0000 Subject: [PATCH 11/26] flatten --- torchbenchmark/models/llama/__init__.py | 15 ++++++++++----- .../models/llama/{llama => }/generation.py | 4 ++-- torchbenchmark/models/llama/llama/__init__.py | 6 ------ torchbenchmark/models/llama/{llama => }/model.py | 1 - torchbenchmark/models/llama/requirements.txt | 2 -- torchbenchmark/models/llama/test.py | 2 +- .../models/llama/{llama => }/tokenizer.py | 0 7 files changed, 13 insertions(+), 17 deletions(-) rename torchbenchmark/models/llama/{llama => }/generation.py (97%) delete mode 100644 torchbenchmark/models/llama/llama/__init__.py rename torchbenchmark/models/llama/{llama => }/model.py (99%) rename torchbenchmark/models/llama/{llama => }/tokenizer.py (100%) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index f6193c6708..2598eb8a76 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -6,21 +6,26 @@ from ...util.model import BenchmarkModel from torchbenchmark.tasks import NLP import torch -from llama import ModelArgs, Transformer +from .model import ModelArgs, Transformer import torch torch.backends.cudnn.deterministic = False torch.backends.cudnn.benchmark = True -class LLAMA(BenchmarkModel): - DEFAULT_EVAL_BSIZE = 128 +class Model(BenchmarkModel): + DEFAULT_EVAL_BSIZE = 32 task = NLP.LANGUAGE_MODELING def __init__(self, test, device, jit=False, batch_size=DEFAULT_EVAL_BSIZE, extra_args=[]): super().__init__(test=test, device=device, jit=jit, batch_size=batch_size, extra_args=extra_args) - self.model_args = ModelArgs(max_batch_size=batch_size, vocab_size=32, max_seq_len=1032) + self.model_args = ModelArgs(vocab_size=32) # TODO: Configuring arguments is breaking stuff: max_batch_size=batch_size, max_seq_len=1032 is breaking stuff self.model = Transformer(self.model_args) - self.example_inputs = torch.tensor([[batch_size, 1, 1], [batch_size, 1,1]], dtype=torch.int) + + # TODO: Implement batching + + # if torch.cuda.is_available(): + # torch.set_default_device('cuda') + self.example_inputs = torch.tensor([[1, 1], [1,1]], dtype=torch.int) def get_module(self): diff --git a/torchbenchmark/models/llama/llama/generation.py b/torchbenchmark/models/llama/generation.py similarity index 97% rename from torchbenchmark/models/llama/llama/generation.py rename to torchbenchmark/models/llama/generation.py index 4a23735d2f..7f7d00914b 100644 --- a/torchbenchmark/models/llama/llama/generation.py +++ b/torchbenchmark/models/llama/generation.py @@ -5,8 +5,8 @@ import torch -from llama.tokenizer import Tokenizer -from llama.model import Transformer +from .tokenizer import Tokenizer +from .model import Transformer class LLaMA: diff --git a/torchbenchmark/models/llama/llama/__init__.py b/torchbenchmark/models/llama/llama/__init__.py deleted file mode 100644 index 009c144d21..0000000000 --- a/torchbenchmark/models/llama/llama/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# This software may be used and distributed according to the terms of the GNU General Public License version 3. 
- -from .generation import LLaMA -from .model import ModelArgs, Transformer -from .tokenizer import Tokenizer \ No newline at end of file diff --git a/torchbenchmark/models/llama/llama/model.py b/torchbenchmark/models/llama/model.py similarity index 99% rename from torchbenchmark/models/llama/llama/model.py rename to torchbenchmark/models/llama/model.py index f91c875295..a4e1a482ad 100644 --- a/torchbenchmark/models/llama/llama/model.py +++ b/torchbenchmark/models/llama/model.py @@ -214,7 +214,6 @@ def __init__(self, params: ModelArgs): def forward(self, tokens: torch.Tensor, start_pos: int): _ , seqlen = tokens.shape - h = self.tok_embeddings(tokens) self.freqs_cis = self.freqs_cis.to(h.device) diff --git a/torchbenchmark/models/llama/requirements.txt b/torchbenchmark/models/llama/requirements.txt index bfda692406..ad213956ee 100644 --- a/torchbenchmark/models/llama/requirements.txt +++ b/torchbenchmark/models/llama/requirements.txt @@ -1,3 +1 @@ -fairscale -fire sentencepiece \ No newline at end of file diff --git a/torchbenchmark/models/llama/test.py b/torchbenchmark/models/llama/test.py index 42f0dace59..30a9f9224c 100644 --- a/torchbenchmark/models/llama/test.py +++ b/torchbenchmark/models/llama/test.py @@ -1,6 +1,6 @@ import torch -from llama import ModelArgs, Transformer, Tokenizer, LLaMA +from .model import ModelArgs, Transformer import torch torch.backends.cudnn.deterministic = False diff --git a/torchbenchmark/models/llama/llama/tokenizer.py b/torchbenchmark/models/llama/tokenizer.py similarity index 100% rename from torchbenchmark/models/llama/llama/tokenizer.py rename to torchbenchmark/models/llama/tokenizer.py From 0ca561fc73379bed6be726a1e0c1016c8ed6299b Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 8 Mar 2023 21:59:37 +0000 Subject: [PATCH 12/26] fixed CI issues --- torchbenchmark/models/llama/__init__.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index 2598eb8a76..940a4cccf5 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -23,9 +23,9 @@ def __init__(self, test, device, jit=False, batch_size=DEFAULT_EVAL_BSIZE, extra # TODO: Implement batching - # if torch.cuda.is_available(): - # torch.set_default_device('cuda') - self.example_inputs = torch.tensor([[1, 1], [1,1]], dtype=torch.int) + if device == "cuda": + torch.set_default_device("cuda") + self.example_inputs = [torch.tensor([[1, 1], [1,1]], dtype=torch.int)] def get_module(self): @@ -43,7 +43,8 @@ def train(self): def eval(self): self.model.eval() with torch.no_grad(): - out=self.model(self.example_inputs, 1) + for example_input in self.example_inputs: + out=self.model(example_input, 1) return (out,) From 2033cddf0851d16306c45cc99d3670f05a365ed3 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 8 Mar 2023 22:11:15 +0000 Subject: [PATCH 13/26] made sure model runs on GPU --- torchbenchmark/models/llama/__init__.py | 1 + torchbenchmark/models/llama/model.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index 940a4cccf5..1d0d813670 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -25,6 +25,7 @@ def __init__(self, test, device, jit=False, batch_size=DEFAULT_EVAL_BSIZE, extra if device == "cuda": torch.set_default_device("cuda") + self.model.to(torch.device("cuda")) self.example_inputs = 
[torch.tensor([[1, 1], [1,1]], dtype=torch.int)] diff --git a/torchbenchmark/models/llama/model.py b/torchbenchmark/models/llama/model.py index a4e1a482ad..974adc49a1 100644 --- a/torchbenchmark/models/llama/model.py +++ b/torchbenchmark/models/llama/model.py @@ -194,7 +194,7 @@ def __init__(self, params: ModelArgs): self.n_layers = params.n_layers self.tok_embeddings = nn.Embedding( - params.vocab_size + 1, params.dim + params.vocab_size + 1, params.dim, ) From 6b617f0cd142f30879252b1a9504f8fc7ce273e9 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Thu, 9 Mar 2023 21:31:11 +0000 Subject: [PATCH 14/26] pass --- torchbenchmark/models/llama/__init__.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index 1d0d813670..1a5b478b13 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -9,9 +9,6 @@ from .model import ModelArgs, Transformer import torch -torch.backends.cudnn.deterministic = False -torch.backends.cudnn.benchmark = True - class Model(BenchmarkModel): DEFAULT_EVAL_BSIZE = 32 task = NLP.LANGUAGE_MODELING @@ -26,7 +23,7 @@ def __init__(self, test, device, jit=False, batch_size=DEFAULT_EVAL_BSIZE, extra if device == "cuda": torch.set_default_device("cuda") self.model.to(torch.device("cuda")) - self.example_inputs = [torch.tensor([[1, 1], [1,1]], dtype=torch.int)] + self.example_inputs = [(torch.tensor([[1, 1], [1,1]], dtype=torch.int), 1)] def get_module(self): @@ -45,7 +42,7 @@ def eval(self): self.model.eval() with torch.no_grad(): for example_input in self.example_inputs: - out=self.model(example_input, 1) + out=self.model(*example_input) return (out,) From 6d3657450f0b2e5ae8575d71e3c539a12b3e04e5 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Thu, 9 Mar 2023 22:37:08 +0000 Subject: [PATCH 15/26] push --- torchbenchmark/models/llama/__init__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index 1a5b478b13..e42ab79664 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -10,12 +10,11 @@ import torch class Model(BenchmarkModel): - DEFAULT_EVAL_BSIZE = 32 task = NLP.LANGUAGE_MODELING - def __init__(self, test, device, jit=False, batch_size=DEFAULT_EVAL_BSIZE, extra_args=[]): + def __init__(self, test, device, jit=False, batch_size=None, extra_args=[]): super().__init__(test=test, device=device, jit=jit, batch_size=batch_size, extra_args=extra_args) - self.model_args = ModelArgs(vocab_size=32) # TODO: Configuring arguments is breaking stuff: max_batch_size=batch_size, max_seq_len=1032 is breaking stuff + self.model_args = ModelArgs(vocab_size=32) self.model = Transformer(self.model_args) # TODO: Implement batching From 2be2c07618078a83e7397eb8a0734dbad2a9ddef Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Fri, 10 Mar 2023 00:54:28 +0000 Subject: [PATCH 16/26] update --- torchbenchmark/models/llama/__init__.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index e42ab79664..cda9dcfba0 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -11,13 +11,13 @@ class Model(BenchmarkModel): task = NLP.LANGUAGE_MODELING - - def __init__(self, test, device, jit=False, batch_size=None, extra_args=[]): - 
super().__init__(test=test, device=device, jit=jit, batch_size=batch_size, extra_args=extra_args) + + # TODO: Implement batching + def __init__(self, test, device, jit=False, batch_size=32, extra_args=[]): + super().__init__(test=test, device=device, jit=jit, batch_size=32, extra_args=extra_args) self.model_args = ModelArgs(vocab_size=32) self.model = Transformer(self.model_args) - # TODO: Implement batching if device == "cuda": torch.set_default_device("cuda") @@ -41,7 +41,9 @@ def eval(self): self.model.eval() with torch.no_grad(): for example_input in self.example_inputs: - out=self.model(*example_input) + tokens, start_pos = example_input + out=self.model(tokens, start_pos) + breakpoint() return (out,) From db7690cca6fe6873d6cc28057222096cc76cd3e8 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Fri, 10 Mar 2023 00:54:54 +0000 Subject: [PATCH 17/26] upd --- torchbenchmark/models/llama/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index cda9dcfba0..664fa8047d 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -43,7 +43,6 @@ def eval(self): for example_input in self.example_inputs: tokens, start_pos = example_input out=self.model(tokens, start_pos) - breakpoint() return (out,) From d039c6d1bfcc434c0dc32811c2bd7bf5ea5994c8 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Fri, 10 Mar 2023 00:57:46 +0000 Subject: [PATCH 18/26] fixed test_llama_example_cuda --- torchbenchmark/models/llama/__init__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index 664fa8047d..7cd290f002 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -26,7 +26,7 @@ def __init__(self, test, device, jit=False, batch_size=32, extra_args=[]): def get_module(self): - return self.model, self.example_inputs + return self.model, *self.example_inputs def train(self): error_msg = """ @@ -41,8 +41,7 @@ def eval(self): self.model.eval() with torch.no_grad(): for example_input in self.example_inputs: - tokens, start_pos = example_input - out=self.model(tokens, start_pos) + out=self.model(*example_input) return (out,) From 4ee4b71a7653ced337d7a53b6b49ad09c07e4d4b Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Fri, 10 Mar 2023 00:58:53 +0000 Subject: [PATCH 19/26] clarify batching limitation --- torchbenchmark/models/llama/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index 7cd290f002..c0621eabd9 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -12,7 +12,7 @@ class Model(BenchmarkModel): task = NLP.LANGUAGE_MODELING - # TODO: Implement batching + # TODO: Implement batching, right if I set as None I get NotImplementedError: Test eval is not implemented. 
def __init__(self, test, device, jit=False, batch_size=32, extra_args=[]): super().__init__(test=test, device=device, jit=jit, batch_size=32, extra_args=extra_args) self.model_args = ModelArgs(vocab_size=32) From 17051bde15a55b985bb83c3e3495960913745bd4 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Fri, 10 Mar 2023 17:40:22 +0000 Subject: [PATCH 20/26] Address Xu feedback --- torchbenchmark/models/llama/__init__.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index c0621eabd9..5d677ef52f 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -22,11 +22,11 @@ def __init__(self, test, device, jit=False, batch_size=32, extra_args=[]): if device == "cuda": torch.set_default_device("cuda") self.model.to(torch.device("cuda")) - self.example_inputs = [(torch.tensor([[1, 1], [1,1]], dtype=torch.int), 1)] + self.example_inputs = (torch.tensor([[1, 1], [1,1]], dtype=torch.int), 1) def get_module(self): - return self.model, *self.example_inputs + return self.model, self.example_inputs def train(self): error_msg = """ @@ -40,8 +40,7 @@ def train(self): def eval(self): self.model.eval() with torch.no_grad(): - for example_input in self.example_inputs: - out=self.model(*example_input) + out=self.model(*self.example_inputs) return (out,) From 2fc7143f75bc5df953d08be4f0119896d782b5c2 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Fri, 10 Mar 2023 17:45:42 +0000 Subject: [PATCH 21/26] Added support for batching --- torchbenchmark/models/llama/__init__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index 5d677ef52f..98f239d155 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -12,9 +12,8 @@ class Model(BenchmarkModel): task = NLP.LANGUAGE_MODELING - # TODO: Implement batching, right if I set as None I get NotImplementedError: Test eval is not implemented. 
- def __init__(self, test, device, jit=False, batch_size=32, extra_args=[]): - super().__init__(test=test, device=device, jit=jit, batch_size=32, extra_args=extra_args) + def __init__(self, test, device, jit=False, batch_size=None, extra_args=[]): + super().__init__(test=test, device=device, jit=jit, batch_size=batch_size, extra_args=extra_args) self.model_args = ModelArgs(vocab_size=32) self.model = Transformer(self.model_args) From 89d3724b890cebbf8dd3e25b1fcf079f536d593a Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Fri, 10 Mar 2023 20:34:15 +0000 Subject: [PATCH 22/26] push --- torchbenchmark/models/llama/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index 98f239d155..e21265725b 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -20,7 +20,8 @@ def __init__(self, test, device, jit=False, batch_size=None, extra_args=[]): if device == "cuda": torch.set_default_device("cuda") - self.model.to(torch.device("cuda")) + + self.model.to(torch.device(device)) self.example_inputs = (torch.tensor([[1, 1], [1,1]], dtype=torch.int), 1) From 21e93c08ce90773a187d6a407cc35b1149e58dd5 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Fri, 10 Mar 2023 12:44:52 -0800 Subject: [PATCH 23/26] Update torchbenchmark/models/llama/__init__.py Co-authored-by: Xu Zhao --- torchbenchmark/models/llama/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index e21265725b..5b61470435 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -22,7 +22,7 @@ def __init__(self, test, device, jit=False, batch_size=None, extra_args=[]): torch.set_default_device("cuda") self.model.to(torch.device(device)) - self.example_inputs = (torch.tensor([[1, 1], [1,1]], dtype=torch.int), 1) + self.example_inputs = (torch.tensor([[1, 1], [1,1]], dtype=torch.int).to(self.device), 1) def get_module(self): From 385431191b331c47c908d1fb3242ffa326570312 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Fri, 10 Mar 2023 20:52:25 +0000 Subject: [PATCH 24/26] update --- torchbenchmark/models/llama/__init__.py | 11 +-------- torchbenchmark/models/llama/test.py | 30 ------------------------ torchbenchmark/models/llama/tokenizer.py | 4 ++-- 3 files changed, 3 insertions(+), 42 deletions(-) delete mode 100644 torchbenchmark/models/llama/test.py diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index e21265725b..704bda204a 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -16,10 +16,6 @@ def __init__(self, test, device, jit=False, batch_size=None, extra_args=[]): super().__init__(test=test, device=device, jit=jit, batch_size=batch_size, extra_args=extra_args) self.model_args = ModelArgs(vocab_size=32) self.model = Transformer(self.model_args) - - - if device == "cuda": - torch.set_default_device("cuda") self.model.to(torch.device(device)) self.example_inputs = (torch.tensor([[1, 1], [1,1]], dtype=torch.int), 1) @@ -41,9 +37,4 @@ def eval(self): self.model.eval() with torch.no_grad(): out=self.model(*self.example_inputs) - return (out,) - - - - - + return (out,) \ No newline at end of file diff --git a/torchbenchmark/models/llama/test.py b/torchbenchmark/models/llama/test.py deleted file mode 100644 index 30a9f9224c..0000000000 --- 
a/torchbenchmark/models/llama/test.py +++ /dev/null @@ -1,30 +0,0 @@ - -import torch -from .model import ModelArgs, Transformer -import torch - -torch.backends.cudnn.deterministic = False -torch.backends.cudnn.benchmark = True - -class Model: - def __init__(self, temperature: float = 0.8, top_p: float = 0.95): - self.model_args = ModelArgs() - self.generator = Transformer(self.model_args) - self.temperature = temperature - self.top_p = top_p - - def get_module(self): - return self.generator - - def train(self): - return NotImplementedError - - def eval(self): - return NotImplementedError - -if __name__ == "__main__": - model = Model() - module = model.get_module() - input_tensor = torch.tensor([[5, 1, 1], [1,1]], dtype=torch.int) - - module(input_tensor, 1) \ No newline at end of file diff --git a/torchbenchmark/models/llama/tokenizer.py b/torchbenchmark/models/llama/tokenizer.py index e4315856ee..95b14ce0fb 100644 --- a/torchbenchmark/models/llama/tokenizer.py +++ b/torchbenchmark/models/llama/tokenizer.py @@ -15,14 +15,14 @@ def __init__(self, model_path: str): # reload tokenizer assert os.path.isfile(model_path), model_path self.sp_model = SentencePieceProcessor(model_file=model_path) - logger.info(f"Reloaded SentencePiece model from {model_path}") + logger.debug(f"Reloaded SentencePiece model from {model_path}") # BOS / EOS token IDs self.n_words: int = self.sp_model.vocab_size() self.bos_id: int = self.sp_model.bos_id() self.eos_id: int = self.sp_model.eos_id() self.pad_id: int = self.sp_model.pad_id() - logger.info( + logger.debug( f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}" ) assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() From faf928f22b3b1e294fecc3d6f7e6b6bb4344b8af Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Fri, 10 Mar 2023 20:54:06 +0000 Subject: [PATCH 25/26] push --- torchbenchmark/models/llama/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index a386fec1b7..71f8f3969a 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -17,8 +17,8 @@ def __init__(self, test, device, jit=False, batch_size=None, extra_args=[]): self.model_args = ModelArgs(vocab_size=32) self.model = Transformer(self.model_args) - self.model.to(torch.device(device)) - self.example_inputs = (torch.tensor([[1, 1], [1,1]], dtype=torch.int).to(self.device), 1) + torch.set_default_device(device) + self.example_inputs = (torch.tensor([[1, 1], [1,1]], dtype=torch.int), 1) def get_module(self): From d8272886c4a5c81ff26b23b68fe8f65b8db14964 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Fri, 10 Mar 2023 15:16:53 -0800 Subject: [PATCH 26/26] Update __init__.py --- torchbenchmark/models/llama/__init__.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index 71f8f3969a..9f17cae38b 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -15,10 +15,9 @@ class Model(BenchmarkModel): def __init__(self, test, device, jit=False, batch_size=None, extra_args=[]): super().__init__(test=test, device=device, jit=jit, batch_size=batch_size, extra_args=extra_args) self.model_args = ModelArgs(vocab_size=32) - self.model = Transformer(self.model_args) - - torch.set_default_device(device) - self.example_inputs = (torch.tensor([[1, 1], [1,1]], dtype=torch.int), 1) 
+ torch.set_default_device(self.device) + self.model = Transformer(self.model_args).to(self.device) + self.example_inputs = (torch.tensor([[1, 1], [1,1]], dtype=torch.int).to(self.device), 1) def get_module(self): @@ -37,4 +36,4 @@ def eval(self): self.model.eval() with torch.no_grad(): out=self.model(*self.example_inputs) - return (out,) \ No newline at end of file + return (out,)
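
For reference, below is a minimal, hypothetical usage sketch of the benchmark model as it stands after the final patch in this series. It is not part of the patch series itself: the constructor signature, get_module(), and eval() behavior mirror the final torchbenchmark/models/llama/__init__.py above, but the standalone invocation, the test="eval" / device arguments, and the assumption that the torchbenchmark package and sentencepiece are installed are illustrative only.

# Illustrative sketch (not part of the patches). Assumes the torchbenchmark
# package from this repository is importable and the llama files added above
# are present.
import torch

from torchbenchmark.models.llama import Model

if __name__ == "__main__":
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Constructor signature follows the final __init__.py in PATCH 26/26.
    model = Model(test="eval", device=device)

    # get_module() returns the Transformer plus its example inputs:
    # a (batch, seq_len) tensor of token ids and a start_pos of 1.
    module, example_inputs = model.get_module()
    tokens, start_pos = example_inputs

    # eval() runs a single forward pass under torch.no_grad() and returns
    # the last-position logits as a one-element tuple.
    (logits,) = model.eval()
    print(logits.shape)  # expected: (batch, vocab_size + 1), here (2, 33)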