From 0332c3471a280c2f79e5af7ff8e8763acebfb551 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Fri, 3 Mar 2023 00:49:30 +0000 Subject: [PATCH 01/26] LLAMA --- torchbenchmark/models/llama | 1 + 1 file changed, 1 insertion(+) create mode 160000 torchbenchmark/models/llama diff --git a/torchbenchmark/models/llama b/torchbenchmark/models/llama new file mode 160000 index 0000000000..76066b1b5c --- /dev/null +++ b/torchbenchmark/models/llama @@ -0,0 +1 @@ +Subproject commit 76066b1b5cf467ce750f51af15cd34de442185e7 From 2242fc7482d07ec20999de678f15757e2165b37f Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Fri, 3 Mar 2023 00:54:19 +0000 Subject: [PATCH 02/26] Add LLAMA --- torchbenchmark/models/llama | 1 - torchbenchmark/models/llama/__init__.py | 43 ++++ torchbenchmark/models/llama/llama/__init__.py | 6 + .../models/llama/llama/generation.py | 77 ++++++ torchbenchmark/models/llama/llama/model.py | 240 ++++++++++++++++++ .../models/llama/llama/tokenizer.py | 40 +++ torchbenchmark/models/llama/metadata.yaml | 8 + torchbenchmark/models/llama/origin | 1 + torchbenchmark/models/llama/requirements.txt | 3 + 9 files changed, 418 insertions(+), 1 deletion(-) delete mode 160000 torchbenchmark/models/llama create mode 100644 torchbenchmark/models/llama/__init__.py create mode 100644 torchbenchmark/models/llama/llama/__init__.py create mode 100644 torchbenchmark/models/llama/llama/generation.py create mode 100644 torchbenchmark/models/llama/llama/model.py create mode 100644 torchbenchmark/models/llama/llama/tokenizer.py create mode 100644 torchbenchmark/models/llama/metadata.yaml create mode 100644 torchbenchmark/models/llama/origin create mode 100644 torchbenchmark/models/llama/requirements.txt diff --git a/torchbenchmark/models/llama b/torchbenchmark/models/llama deleted file mode 160000 index 76066b1b5c..0000000000 --- a/torchbenchmark/models/llama +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 76066b1b5cf467ce750f51af15cd34de442185e7 diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py new file mode 100644 index 0000000000..4c7c4bff4b --- /dev/null +++ b/torchbenchmark/models/llama/__init__.py @@ -0,0 +1,43 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# This software may be used and distributed according to the terms of the GNU General Public License version 3. 
+ + + +from ...util.model import BenchmarkModel +from typing import Tuple +import os +import sys +import torch +import fire +import time +import json + +from pathlib import Path + +from fairscale.nn.model_parallel.initialize import initialize_model_parallel + +from llama import ModelArgs, Transformer, Tokenizer, LLaMA +import torch + +torch.backends.cudnn.deterministic = False +torch.backends.cudnn.benchmark = True + +class Model(BenchmarkModel): + def __init__(self, temperature: float = 0.8, top_p: float = 0.95): + self.model_args = ModelArgs() + self.generator = Transformer(self.model_args) + self.temperature = temperature + self.top_p = top_p + + def inference(self, prompts : str): + prompts = ["The capital of Germany is the city of", "Here is my sonnet in the style of Shakespeare about an artificial intelligence:"] + results = self.generator.generate(prompts, max_gen_len=256, temperature=self.temperature, top_p=self.top_p) + + for result in results: + print(result) + print("\n==================================\n") + + + + + diff --git a/torchbenchmark/models/llama/llama/__init__.py b/torchbenchmark/models/llama/llama/__init__.py new file mode 100644 index 0000000000..009c144d21 --- /dev/null +++ b/torchbenchmark/models/llama/llama/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# This software may be used and distributed according to the terms of the GNU General Public License version 3. + +from .generation import LLaMA +from .model import ModelArgs, Transformer +from .tokenizer import Tokenizer \ No newline at end of file diff --git a/torchbenchmark/models/llama/llama/generation.py b/torchbenchmark/models/llama/llama/generation.py new file mode 100644 index 0000000000..4a23735d2f --- /dev/null +++ b/torchbenchmark/models/llama/llama/generation.py @@ -0,0 +1,77 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# This software may be used and distributed according to the terms of the GNU General Public License version 3. 
+ +from typing import List + +import torch + +from llama.tokenizer import Tokenizer +from llama.model import Transformer + + +class LLaMA: + def __init__(self, model: Transformer, tokenizer: Tokenizer): + self.model = model + self.tokenizer = tokenizer + + def generate( + self, + prompts: List[str], + max_gen_len: int, + temperature: float = 0.8, + top_p: float = 0.95, + ) -> List[str]: + bsz = len(prompts) + params = self.model.params + assert bsz <= params.max_batch_size, (bsz, params.max_batch_size) + + prompt_tokens = [self.tokenizer.encode(x, bos=True, eos=False) for x in prompts] + + min_prompt_size = min([len(t) for t in prompt_tokens]) + max_prompt_size = max([len(t) for t in prompt_tokens]) + + total_len = min(params.max_seq_len, max_gen_len + max_prompt_size) + + tokens = torch.full((bsz, total_len), self.tokenizer.pad_id).cuda().long() + for k, t in enumerate(prompt_tokens): + tokens[k, : len(t)] = torch.tensor(t).long() + input_text_mask = tokens != self.tokenizer.pad_id + start_pos = min_prompt_size + prev_pos = 0 + for cur_pos in range(start_pos, total_len): + logits = self.model.forward(tokens[:, prev_pos:cur_pos], prev_pos) + if temperature > 0: + probs = torch.softmax(logits / temperature, dim=-1) + next_token = sample_top_p(probs, top_p) + else: + next_token = torch.argmax(logits, dim=-1) + next_token = next_token.reshape(-1) + # only replace token if prompt has already been generated + next_token = torch.where( + input_text_mask[:, cur_pos], tokens[:, cur_pos], next_token + ) + tokens[:, cur_pos] = next_token + prev_pos = cur_pos + + decoded = [] + for i, t in enumerate(tokens.tolist()): + # cut to max gen len + t = t[: len(prompt_tokens[i]) + max_gen_len] + # cut to eos tok if any + try: + t = t[: t.index(self.tokenizer.eos_id)] + except ValueError: + pass + decoded.append(self.tokenizer.decode(t)) + return decoded + + +def sample_top_p(probs, p): + probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True) + probs_sum = torch.cumsum(probs_sort, dim=-1) + mask = probs_sum - probs_sort > p + probs_sort[mask] = 0.0 + probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True)) + next_token = torch.multinomial(probs_sort, num_samples=1) + next_token = torch.gather(probs_idx, -1, next_token) + return next_token \ No newline at end of file diff --git a/torchbenchmark/models/llama/llama/model.py b/torchbenchmark/models/llama/llama/model.py new file mode 100644 index 0000000000..52e0c04379 --- /dev/null +++ b/torchbenchmark/models/llama/llama/model.py @@ -0,0 +1,240 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# This software may be used and distributed according to the terms of the GNU General Public License version 3. 
+ +from typing import Optional, Tuple +from dataclasses import dataclass +import math + +import torch +from torch import nn +import torch.nn.functional as F + +import fairscale.nn.model_parallel.initialize as fs_init +from fairscale.nn.model_parallel.layers import ( + ParallelEmbedding, + RowParallelLinear, + ColumnParallelLinear, +) + + +@dataclass +class ModelArgs: + dim: int = 512 + n_layers: int = 8 + n_heads: int = 8 + vocab_size: int = -1 # defined later by tokenizer + multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2 + norm_eps: float = 1e-5 + + max_batch_size: int = 32 + max_seq_len: int = 1024 + + +class RMSNorm(torch.nn.Module): + def __init__(self, dim: int, eps: float = 1e-6): + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.ones(dim)) + + def _norm(self, x): + return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) + + def forward(self, x): + output = self._norm(x.float()).type_as(x) + return output * self.weight + + +def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0): + freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) + t = torch.arange(end, device=freqs.device) # type: ignore + freqs = torch.outer(t, freqs).float() # type: ignore + freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64 + return freqs_cis + + +def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor): + ndim = x.ndim + assert 0 <= 1 < ndim + assert freqs_cis.shape == (x.shape[1], x.shape[-1]) + shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)] + return freqs_cis.view(*shape) + + +def apply_rotary_emb( + xq: torch.Tensor, + xk: torch.Tensor, + freqs_cis: torch.Tensor, +) -> Tuple[torch.Tensor, torch.Tensor]: + xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) + xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) + freqs_cis = reshape_for_broadcast(freqs_cis, xq_) + xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3) + xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3) + return xq_out.type_as(xq), xk_out.type_as(xk) + + +class Attention(nn.Module): + def __init__(self, args: ModelArgs): + super().__init__() + + self.n_local_heads = args.n_heads // fs_init.get_model_parallel_world_size() + self.head_dim = args.dim // args.n_heads + + self.wq = ColumnParallelLinear( + args.dim, + args.n_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.wk = ColumnParallelLinear( + args.dim, + args.n_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.wv = ColumnParallelLinear( + args.dim, + args.n_heads * self.head_dim, + bias=False, + gather_output=False, + init_method=lambda x: x, + ) + self.wo = RowParallelLinear( + args.n_heads * self.head_dim, + args.dim, + bias=False, + input_is_parallel=True, + init_method=lambda x: x, + ) + + self.cache_k = torch.zeros( + (args.max_batch_size, args.max_seq_len, self.n_local_heads, self.head_dim) + ).cuda() + self.cache_v = torch.zeros( + (args.max_batch_size, args.max_seq_len, self.n_local_heads, self.head_dim) + ).cuda() + + def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask: Optional[torch.Tensor]): + bsz, seqlen, _ = x.shape + xq, xk, xv = self.wq(x), self.wk(x), self.wv(x) + + xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim) + xk = xk.view(bsz, seqlen, self.n_local_heads, self.head_dim) + xv = xv.view(bsz, seqlen, self.n_local_heads, 
self.head_dim) + + xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis) + + self.cache_k = self.cache_k.to(xq) + self.cache_v = self.cache_v.to(xq) + + self.cache_k[:bsz, start_pos : start_pos + seqlen] = xk + self.cache_v[:bsz, start_pos : start_pos + seqlen] = xv + + keys = self.cache_k[:bsz, : start_pos + seqlen] + values = self.cache_v[:bsz, : start_pos + seqlen] + + xq = xq.transpose(1, 2) + keys = keys.transpose(1, 2) + values = values.transpose(1, 2) + scores = torch.matmul(xq, keys.transpose(2, 3)) / math.sqrt(self.head_dim) + if mask is not None: + scores = scores + mask # (bs, n_local_heads, slen, cache_len + slen) + scores = F.softmax(scores.float(), dim=-1).type_as(xq) + output = torch.matmul(scores, values) # (bs, n_local_heads, slen, head_dim) + output = output.transpose( + 1, 2 + ).contiguous().view(bsz, seqlen, -1) + + return self.wo(output) + + +class FeedForward(nn.Module): + def __init__( + self, + dim: int, + hidden_dim: int, + multiple_of: int, + ): + super().__init__() + hidden_dim = int(2 * hidden_dim / 3) + hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) + + self.w1 = ColumnParallelLinear( + dim, hidden_dim, bias=False, gather_output=False, init_method=lambda x: x + ) + self.w2 = RowParallelLinear( + hidden_dim, dim, bias=False, input_is_parallel=True, init_method=lambda x: x + ) + self.w3 = ColumnParallelLinear( + dim, hidden_dim, bias=False, gather_output=False, init_method=lambda x: x + ) + + def forward(self, x): + return self.w2(F.silu(self.w1(x)) * self.w3(x)) + + +class TransformerBlock(nn.Module): + def __init__(self, layer_id: int, args: ModelArgs): + super().__init__() + self.n_heads = args.n_heads + self.dim = args.dim + self.head_dim = args.dim // args.n_heads + self.attention = Attention(args) + self.feed_forward = FeedForward( + dim=args.dim, hidden_dim=4 * args.dim, multiple_of=args.multiple_of + ) + self.layer_id = layer_id + self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps) + self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps) + + def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask: Optional[torch.Tensor]): + h = x + self.attention.forward(self.attention_norm(x), start_pos, freqs_cis, mask) + out = h + self.feed_forward.forward(self.ffn_norm(h)) + return out + + +class Transformer(nn.Module): + def __init__(self, params: ModelArgs): + super().__init__() + self.params = params + self.vocab_size = params.vocab_size + self.n_layers = params.n_layers + + self.tok_embeddings = ParallelEmbedding( + params.vocab_size, params.dim, init_method=lambda x: x + ) + + self.layers = torch.nn.ModuleList() + for layer_id in range(params.n_layers): + self.layers.append(TransformerBlock(layer_id, params)) + + self.norm = RMSNorm(params.dim, eps=params.norm_eps) + self.output = ColumnParallelLinear( + params.dim, params.vocab_size, bias=False, init_method=lambda x: x + ) + + self.freqs_cis = precompute_freqs_cis( + self.params.dim // self.params.n_heads, self.params.max_seq_len * 2 + ) + + # TODO: Not sure if we should keep or remove inference mode + # But OP is using it + # @torch.inference_mode() + def forward(self, tokens: torch.Tensor, start_pos: int): + _bsz, seqlen = tokens.shape + h = self.tok_embeddings(tokens) + self.freqs_cis = self.freqs_cis.to(h.device) + freqs_cis = self.freqs_cis[start_pos : start_pos + seqlen] + + mask = None + if seqlen > 1: + mask = torch.full((1, 1, seqlen, seqlen), float("-inf"), device=tokens.device) + mask = torch.triu(mask, diagonal=start_pos + 1).type_as(h) + + 
for layer in self.layers: + h = layer(h, start_pos, freqs_cis, mask) + h = self.norm(h) + output = self.output(h[:, -1, :]) # only compute last logits + return output.float() diff --git a/torchbenchmark/models/llama/llama/tokenizer.py b/torchbenchmark/models/llama/llama/tokenizer.py new file mode 100644 index 0000000000..e4315856ee --- /dev/null +++ b/torchbenchmark/models/llama/llama/tokenizer.py @@ -0,0 +1,40 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# This software may be used and distributed according to the terms of the GNU General Public License version 3. + +from sentencepiece import SentencePieceProcessor +from logging import getLogger +from typing import List +import os + + +logger = getLogger() + + +class Tokenizer: + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + logger.info(f"Reloaded SentencePiece model from {model_path}") + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.pad_id() + logger.info( + f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}" + ) + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + def encode(self, s: str, bos: bool, eos: bool) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + return self.sp_model.decode(t) diff --git a/torchbenchmark/models/llama/metadata.yaml b/torchbenchmark/models/llama/metadata.yaml new file mode 100644 index 0000000000..cc072bb909 --- /dev/null +++ b/torchbenchmark/models/llama/metadata.yaml @@ -0,0 +1,8 @@ +devices: + NVIDIA A100-SXM4-40GB: + eval_batch_size: 1024 +eval_benchmark: false +eval_deterministic: false +eval_nograd: true +train_benchmark: false +train_deterministic: false \ No newline at end of file diff --git a/torchbenchmark/models/llama/origin b/torchbenchmark/models/llama/origin new file mode 100644 index 0000000000..3446020c5c --- /dev/null +++ b/torchbenchmark/models/llama/origin @@ -0,0 +1 @@ +origin https://github.com/facebookresearch/llama \ No newline at end of file diff --git a/torchbenchmark/models/llama/requirements.txt b/torchbenchmark/models/llama/requirements.txt new file mode 100644 index 0000000000..bfda692406 --- /dev/null +++ b/torchbenchmark/models/llama/requirements.txt @@ -0,0 +1,3 @@ +fairscale +fire +sentencepiece \ No newline at end of file From c0468f43b497d85fd75aa963a293dfec91939df7 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Fri, 3 Mar 2023 01:34:17 +0000 Subject: [PATCH 03/26] add intall.py --- torchbenchmark/models/llama/install.py | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 torchbenchmark/models/llama/install.py diff --git a/torchbenchmark/models/llama/install.py b/torchbenchmark/models/llama/install.py new file mode 100644 index 0000000000..d6baff0571 --- /dev/null +++ b/torchbenchmark/models/llama/install.py @@ -0,0 +1,8 @@ +import subprocess +import sys + +def pip_install_requirements(): + subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', '-r', 'requirements.txt']) + +if __name__ == '__main__': + pip_install_requirements() \ No newline at end of file From 54b82af55cd5aa2d691fffa1c0388f23db2b99b4 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Mon, 6 Mar 2023 18:39:47 
+0000 Subject: [PATCH 04/26] Fixed some stuff --- torchbenchmark/models/llama/__init__.py | 34 +++++++------- torchbenchmark/models/llama/llama/model.py | 52 +++++++++------------- torchbenchmark/models/llama/test.py | 37 +++++++++++++++ 3 files changed, 75 insertions(+), 48 deletions(-) create mode 100644 torchbenchmark/models/llama/test.py diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index 4c7c4bff4b..7f1de280b5 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -4,18 +4,7 @@ from ...util.model import BenchmarkModel -from typing import Tuple -import os -import sys import torch -import fire -import time -import json - -from pathlib import Path - -from fairscale.nn.model_parallel.initialize import initialize_model_parallel - from llama import ModelArgs, Transformer, Tokenizer, LLaMA import torch @@ -28,14 +17,25 @@ def __init__(self, temperature: float = 0.8, top_p: float = 0.95): self.generator = Transformer(self.model_args) self.temperature = temperature self.top_p = top_p + + + + # def inference(self, prompts : str): + # prompts = ["The capital of Germany is the city of", "Here is my sonnet in the style of Shakespeare about an artificial intelligence:"] + # results = self.generator.generate(prompts, max_gen_len=256, temperature=self.temperature, top_p=self.top_p) + + # for result in results: + # print(result) + # print("\n==================================\n") + + def get_module(self): + return self.generator, - def inference(self, prompts : str): - prompts = ["The capital of Germany is the city of", "Here is my sonnet in the style of Shakespeare about an artificial intelligence:"] - results = self.generator.generate(prompts, max_gen_len=256, temperature=self.temperature, top_p=self.top_p) + def train(self): + return NotImplementedError - for result in results: - print(result) - print("\n==================================\n") + def eval(self): + return NotImplementedError diff --git a/torchbenchmark/models/llama/llama/model.py b/torchbenchmark/models/llama/llama/model.py index 52e0c04379..b5b62b8c68 100644 --- a/torchbenchmark/models/llama/llama/model.py +++ b/torchbenchmark/models/llama/llama/model.py @@ -9,13 +9,6 @@ from torch import nn import torch.nn.functional as F -import fairscale.nn.model_parallel.initialize as fs_init -from fairscale.nn.model_parallel.layers import ( - ParallelEmbedding, - RowParallelLinear, - ColumnParallelLinear, -) - @dataclass class ModelArgs: @@ -77,36 +70,32 @@ class Attention(nn.Module): def __init__(self, args: ModelArgs): super().__init__() - self.n_local_heads = args.n_heads // fs_init.get_model_parallel_world_size() + self.n_local_heads = args.n_heads # Basically we just assume world size of 1 // fs_init.get_model_parallel_world_size() self.head_dim = args.dim // args.n_heads - self.wq = ColumnParallelLinear( + self.wq = nn.Linear( args.dim, args.n_heads * self.head_dim, bias=False, - gather_output=False, - init_method=lambda x: x, + ) - self.wk = ColumnParallelLinear( + self.wk = nn.Linear( args.dim, args.n_heads * self.head_dim, bias=False, - gather_output=False, - init_method=lambda x: x, + ) - self.wv = ColumnParallelLinear( + self.wv = nn.Linear( args.dim, args.n_heads * self.head_dim, bias=False, - gather_output=False, - init_method=lambda x: x, + ) - self.wo = RowParallelLinear( + self.wo = nn.Linear( args.n_heads * self.head_dim, args.dim, bias=False, - input_is_parallel=True, - init_method=lambda x: x, + ) self.cache_k = torch.zeros( @@ 
-161,14 +150,14 @@ def __init__( hidden_dim = int(2 * hidden_dim / 3) hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of) - self.w1 = ColumnParallelLinear( - dim, hidden_dim, bias=False, gather_output=False, init_method=lambda x: x + self.w1 = nn.Linear( + dim, hidden_dim, bias=False ) - self.w2 = RowParallelLinear( - hidden_dim, dim, bias=False, input_is_parallel=True, init_method=lambda x: x + self.w2 = nn.Linear( + hidden_dim, dim, bias=False ) - self.w3 = ColumnParallelLinear( - dim, hidden_dim, bias=False, gather_output=False, init_method=lambda x: x + self.w3 = nn.Linear( + dim, hidden_dim, bias=False ) def forward(self, x): @@ -202,17 +191,18 @@ def __init__(self, params: ModelArgs): self.vocab_size = params.vocab_size self.n_layers = params.n_layers - self.tok_embeddings = ParallelEmbedding( - params.vocab_size, params.dim, init_method=lambda x: x - ) + ## Commenting otherwise I get a model parallel group is not initialized error + # self.tok_embeddings = ParallelEmbedding( + # params.vocab_size, params.dim, init_method=lambda x: x + # ) self.layers = torch.nn.ModuleList() for layer_id in range(params.n_layers): self.layers.append(TransformerBlock(layer_id, params)) self.norm = RMSNorm(params.dim, eps=params.norm_eps) - self.output = ColumnParallelLinear( - params.dim, params.vocab_size, bias=False, init_method=lambda x: x + self.output = nn.Linear( + params.dim, params.vocab_size + 1, bias=False ) self.freqs_cis = precompute_freqs_cis( diff --git a/torchbenchmark/models/llama/test.py b/torchbenchmark/models/llama/test.py new file mode 100644 index 0000000000..f8f1e2e54b --- /dev/null +++ b/torchbenchmark/models/llama/test.py @@ -0,0 +1,37 @@ + +import torch +from llama import ModelArgs, Transformer, Tokenizer, LLaMA +import torch + +torch.backends.cudnn.deterministic = False +torch.backends.cudnn.benchmark = True + +class Model: + def __init__(self, temperature: float = 0.8, top_p: float = 0.95): + self.model_args = ModelArgs() + self.generator = Transformer(self.model_args) + self.temperature = temperature + self.top_p = top_p + + + + # def inference(self, prompts : str): + # prompts = ["The capital of Germany is the city of", "Here is my sonnet in the style of Shakespeare about an artificial intelligence:"] + # results = self.generator.generate(prompts, max_gen_len=256, temperature=self.temperature, top_p=self.top_p) + + # for result in results: + # print(result) + # print("\n==================================\n") + + def get_module(self): + return self.generator + + def train(self): + return NotImplementedError + + def eval(self): + return NotImplementedError + +if __name__ == "__main__": + model = Model() + model.get_module().generate((torch.randn(1,1,1,1))) \ No newline at end of file From b451954425196a7e0e9b28a7e356eb1dad262237 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Mon, 6 Mar 2023 19:44:19 +0000 Subject: [PATCH 05/26] test now runs --- torchbenchmark/models/llama/llama/model.py | 25 ++++++++++++---------- torchbenchmark/models/llama/test.py | 15 ++++--------- 2 files changed, 18 insertions(+), 22 deletions(-) diff --git a/torchbenchmark/models/llama/llama/model.py b/torchbenchmark/models/llama/llama/model.py index b5b62b8c68..7fa6d8a614 100644 --- a/torchbenchmark/models/llama/llama/model.py +++ b/torchbenchmark/models/llama/llama/model.py @@ -15,7 +15,7 @@ class ModelArgs: dim: int = 512 n_layers: int = 8 n_heads: int = 8 - vocab_size: int = -1 # defined later by tokenizer + vocab_size: int = 32 # TODO: This was -1 I changed it to make 
things work multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2 norm_eps: float = 1e-5 @@ -128,8 +128,10 @@ def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask keys = keys.transpose(1, 2) values = values.transpose(1, 2) scores = torch.matmul(xq, keys.transpose(2, 3)) / math.sqrt(self.head_dim) - if mask is not None: - scores = scores + mask # (bs, n_local_heads, slen, cache_len + slen) + + # TODO: RuntimeError: The size of tensor a (3) must match the size of tensor b (2) at non-singleton dimension 3 + # if mask is not None: + # scores = scores + mask # (bs, n_local_heads, slen, cache_len + slen) scores = F.softmax(scores.float(), dim=-1).type_as(xq) output = torch.matmul(scores, values) # (bs, n_local_heads, slen, head_dim) output = output.transpose( @@ -191,10 +193,10 @@ def __init__(self, params: ModelArgs): self.vocab_size = params.vocab_size self.n_layers = params.n_layers - ## Commenting otherwise I get a model parallel group is not initialized error - # self.tok_embeddings = ParallelEmbedding( - # params.vocab_size, params.dim, init_method=lambda x: x - # ) + self.tok_embeddings = nn.Embedding( + params.vocab_size + 1, params.dim + ) + self.layers = torch.nn.ModuleList() for layer_id in range(params.n_layers): @@ -209,16 +211,17 @@ def __init__(self, params: ModelArgs): self.params.dim // self.params.n_heads, self.params.max_seq_len * 2 ) - # TODO: Not sure if we should keep or remove inference mode - # But OP is using it - # @torch.inference_mode() def forward(self, tokens: torch.Tensor, start_pos: int): - _bsz, seqlen = tokens.shape + _ , seqlen = tokens.shape + + h = self.tok_embeddings(tokens) + self.freqs_cis = self.freqs_cis.to(h.device) freqs_cis = self.freqs_cis[start_pos : start_pos + seqlen] mask = None + if seqlen > 1: mask = torch.full((1, 1, seqlen, seqlen), float("-inf"), device=tokens.device) mask = torch.triu(mask, diagonal=start_pos + 1).type_as(h) diff --git a/torchbenchmark/models/llama/test.py b/torchbenchmark/models/llama/test.py index f8f1e2e54b..663c77cc86 100644 --- a/torchbenchmark/models/llama/test.py +++ b/torchbenchmark/models/llama/test.py @@ -12,16 +12,6 @@ def __init__(self, temperature: float = 0.8, top_p: float = 0.95): self.generator = Transformer(self.model_args) self.temperature = temperature self.top_p = top_p - - - - # def inference(self, prompts : str): - # prompts = ["The capital of Germany is the city of", "Here is my sonnet in the style of Shakespeare about an artificial intelligence:"] - # results = self.generator.generate(prompts, max_gen_len=256, temperature=self.temperature, top_p=self.top_p) - - # for result in results: - # print(result) - # print("\n==================================\n") def get_module(self): return self.generator @@ -34,4 +24,7 @@ def eval(self): if __name__ == "__main__": model = Model() - model.get_module().generate((torch.randn(1,1,1,1))) \ No newline at end of file + module = model.get_module() + input_tensor = torch.tensor([[1, 1], [1,1]], dtype=torch.int) + + module(input_tensor, 1) \ No newline at end of file From b95af599e6472a947c0d11ad672f52e582ed8eb6 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Mon, 6 Mar 2023 19:55:32 +0000 Subject: [PATCH 06/26] fix model --- torchbenchmark/models/llama/__init__.py | 31 ++++++++++++------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index 7f1de280b5..bf354dea62 100644 --- 
a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -12,30 +12,29 @@ torch.backends.cudnn.benchmark = True class Model(BenchmarkModel): - def __init__(self, temperature: float = 0.8, top_p: float = 0.95): + def __init__(self): self.model_args = ModelArgs() - self.generator = Transformer(self.model_args) - self.temperature = temperature - self.top_p = top_p + self.model = Transformer(self.model_args) + self.example_inputs = torch.tensor([[1, 1], [1,1]], dtype=torch.int) - - # def inference(self, prompts : str): - # prompts = ["The capital of Germany is the city of", "Here is my sonnet in the style of Shakespeare about an artificial intelligence:"] - # results = self.generator.generate(prompts, max_gen_len=256, temperature=self.temperature, top_p=self.top_p) - - # for result in results: - # print(result) - # print("\n==================================\n") - def get_module(self): - return self.generator, + return self.transformer, self.example_inputs def train(self): - return NotImplementedError + error_msg = """ + As of March 6, 2023 + The weights for this model are not publicly available and require a valid research reason to use + The publicly available github repo is inference only + https://github.com/facebookresearch/llama + """ + return NotImplementedError(error_msg) def eval(self): - return NotImplementedError + self.model.eval() + with torch.no_grad(): + out=self.model(self.example_inputs, 1) + return (out,) From 1a7833a48e92f539753bfee246128ccdd3f8d678 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Mon, 6 Mar 2023 19:59:58 +0000 Subject: [PATCH 07/26] upd --- torchbenchmark/models/llama/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index bf354dea62..2b75c77ae0 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -19,7 +19,7 @@ def __init__(self): def get_module(self): - return self.transformer, self.example_inputs + return self.model, self.example_inputs def train(self): error_msg = """ From 5ca98df8b04a54b0d3fbf8d8e4cd1b5494b247da Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Mon, 6 Mar 2023 20:05:34 +0000 Subject: [PATCH 08/26] updat docs --- torchbenchmark/models/ADDING_MODELS.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/torchbenchmark/models/ADDING_MODELS.md b/torchbenchmark/models/ADDING_MODELS.md index 8fdb646a38..44cdb04096 100644 --- a/torchbenchmark/models/ADDING_MODELS.md +++ b/torchbenchmark/models/ADDING_MODELS.md @@ -30,6 +30,16 @@ Some of the APIs are optional, and you can raise NotImplemented if a particular Take care to set the random seed like [here](https://github.com/pytorch/benchmark/blob/master/torchbenchmark/models/Background_Matting/__init__.py#L20), to ensure your model runs the same way each time it's benchmarked. + +#### A minimal new model addition +A bare miminum example you can follow is https://github.com/pytorch/benchmark/tree/main/torchbenchmark/models/phlippe_resnet + +The functions you specifically need to implement are +1. `__init__()` which is responsible for initalizing your `nn.Module` +2. `get_module()` which is responsible for returning the initialized `nn.Module` and an example input +3. `train()` which is a training loop, you can return a `NotImplementedError()` if your example is inference only +4. 
`eval()` which showcases a simple inference + ### Preparing install.py and dependencies Simply put, install.py should be a one stop shop to install all the dependencies for your model, __except torch, torchvision, torchtext__ which should be assumed to From aa0d4cb84cb9ed0fd79afcd3ca1c04fa10342ee9 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Mon, 6 Mar 2023 21:18:17 +0000 Subject: [PATCH 09/26] add stuff --- torchbenchmark/models/llama/__init__.py | 11 ++++++++--- torchbenchmark/models/llama/llama/model.py | 2 +- torchbenchmark/models/llama/test.py | 2 +- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index 2b75c77ae0..2008115617 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -4,6 +4,7 @@ from ...util.model import BenchmarkModel +from torchbenchmark.tasks import NLP import torch from llama import ModelArgs, Transformer, Tokenizer, LLaMA import torch @@ -12,10 +13,14 @@ torch.backends.cudnn.benchmark = True class Model(BenchmarkModel): - def __init__(self): - self.model_args = ModelArgs() + DEFAULT_EVAL_BSIZE = 128 + task = NLP.LANGUAGE_MODELING + + def __init__(self, test, device, jit=False, batch_size=DEFAULT_EVAL_BSIZE, extra_args=[]): + super().__init__(test=test, device=device, jit=jit, batch_size=batch_size, extra_args=extra_args) + self.model_args = ModelArgs(max_batch_size=batch_size, vocab_size=32, max_seq_len=1032) self.model = Transformer(self.model_args) - self.example_inputs = torch.tensor([[1, 1], [1,1]], dtype=torch.int) + self.example_inputs = torch.tensor([[batch_size, 1, 1], [batch_size, 1,1]], dtype=torch.int) def get_module(self): diff --git a/torchbenchmark/models/llama/llama/model.py b/torchbenchmark/models/llama/llama/model.py index 7fa6d8a614..f91c875295 100644 --- a/torchbenchmark/models/llama/llama/model.py +++ b/torchbenchmark/models/llama/llama/model.py @@ -15,7 +15,7 @@ class ModelArgs: dim: int = 512 n_layers: int = 8 n_heads: int = 8 - vocab_size: int = 32 # TODO: This was -1 I changed it to make things work + vocab_size: int = -1 multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2 norm_eps: float = 1e-5 diff --git a/torchbenchmark/models/llama/test.py b/torchbenchmark/models/llama/test.py index 663c77cc86..42f0dace59 100644 --- a/torchbenchmark/models/llama/test.py +++ b/torchbenchmark/models/llama/test.py @@ -25,6 +25,6 @@ def eval(self): if __name__ == "__main__": model = Model() module = model.get_module() - input_tensor = torch.tensor([[1, 1], [1,1]], dtype=torch.int) + input_tensor = torch.tensor([[5, 1, 1], [1,1]], dtype=torch.int) module(input_tensor, 1) \ No newline at end of file From 1a5f4d2c0c08c5d7959835cf1901159755b0f677 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Mon, 6 Mar 2023 21:45:25 +0000 Subject: [PATCH 10/26] minor fix --- torchbenchmark/models/llama/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index 2008115617..f6193c6708 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -6,13 +6,13 @@ from ...util.model import BenchmarkModel from torchbenchmark.tasks import NLP import torch -from llama import ModelArgs, Transformer, Tokenizer, LLaMA +from llama import ModelArgs, Transformer import torch torch.backends.cudnn.deterministic = False torch.backends.cudnn.benchmark = True -class 
Model(BenchmarkModel): +class LLAMA(BenchmarkModel): DEFAULT_EVAL_BSIZE = 128 task = NLP.LANGUAGE_MODELING From 7a7627fbb31a7fae1e884e49bd02933f0fe500d3 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Mon, 6 Mar 2023 22:19:30 +0000 Subject: [PATCH 11/26] flatten --- torchbenchmark/models/llama/__init__.py | 15 ++++++++++----- .../models/llama/{llama => }/generation.py | 4 ++-- torchbenchmark/models/llama/llama/__init__.py | 6 ------ torchbenchmark/models/llama/{llama => }/model.py | 1 - torchbenchmark/models/llama/requirements.txt | 2 -- torchbenchmark/models/llama/test.py | 2 +- .../models/llama/{llama => }/tokenizer.py | 0 7 files changed, 13 insertions(+), 17 deletions(-) rename torchbenchmark/models/llama/{llama => }/generation.py (97%) delete mode 100644 torchbenchmark/models/llama/llama/__init__.py rename torchbenchmark/models/llama/{llama => }/model.py (99%) rename torchbenchmark/models/llama/{llama => }/tokenizer.py (100%) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index f6193c6708..2598eb8a76 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -6,21 +6,26 @@ from ...util.model import BenchmarkModel from torchbenchmark.tasks import NLP import torch -from llama import ModelArgs, Transformer +from .model import ModelArgs, Transformer import torch torch.backends.cudnn.deterministic = False torch.backends.cudnn.benchmark = True -class LLAMA(BenchmarkModel): - DEFAULT_EVAL_BSIZE = 128 +class Model(BenchmarkModel): + DEFAULT_EVAL_BSIZE = 32 task = NLP.LANGUAGE_MODELING def __init__(self, test, device, jit=False, batch_size=DEFAULT_EVAL_BSIZE, extra_args=[]): super().__init__(test=test, device=device, jit=jit, batch_size=batch_size, extra_args=extra_args) - self.model_args = ModelArgs(max_batch_size=batch_size, vocab_size=32, max_seq_len=1032) + self.model_args = ModelArgs(vocab_size=32) # TODO: Configuring arguments is breaking stuff: max_batch_size=batch_size, max_seq_len=1032 is breaking stuff self.model = Transformer(self.model_args) - self.example_inputs = torch.tensor([[batch_size, 1, 1], [batch_size, 1,1]], dtype=torch.int) + + # TODO: Implement batching + + # if torch.cuda.is_available(): + # torch.set_default_device('cuda') + self.example_inputs = torch.tensor([[1, 1], [1,1]], dtype=torch.int) def get_module(self): diff --git a/torchbenchmark/models/llama/llama/generation.py b/torchbenchmark/models/llama/generation.py similarity index 97% rename from torchbenchmark/models/llama/llama/generation.py rename to torchbenchmark/models/llama/generation.py index 4a23735d2f..7f7d00914b 100644 --- a/torchbenchmark/models/llama/llama/generation.py +++ b/torchbenchmark/models/llama/generation.py @@ -5,8 +5,8 @@ import torch -from llama.tokenizer import Tokenizer -from llama.model import Transformer +from .tokenizer import Tokenizer +from .model import Transformer class LLaMA: diff --git a/torchbenchmark/models/llama/llama/__init__.py b/torchbenchmark/models/llama/llama/__init__.py deleted file mode 100644 index 009c144d21..0000000000 --- a/torchbenchmark/models/llama/llama/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# This software may be used and distributed according to the terms of the GNU General Public License version 3. 
- -from .generation import LLaMA -from .model import ModelArgs, Transformer -from .tokenizer import Tokenizer \ No newline at end of file diff --git a/torchbenchmark/models/llama/llama/model.py b/torchbenchmark/models/llama/model.py similarity index 99% rename from torchbenchmark/models/llama/llama/model.py rename to torchbenchmark/models/llama/model.py index f91c875295..a4e1a482ad 100644 --- a/torchbenchmark/models/llama/llama/model.py +++ b/torchbenchmark/models/llama/model.py @@ -214,7 +214,6 @@ def __init__(self, params: ModelArgs): def forward(self, tokens: torch.Tensor, start_pos: int): _ , seqlen = tokens.shape - h = self.tok_embeddings(tokens) self.freqs_cis = self.freqs_cis.to(h.device) diff --git a/torchbenchmark/models/llama/requirements.txt b/torchbenchmark/models/llama/requirements.txt index bfda692406..ad213956ee 100644 --- a/torchbenchmark/models/llama/requirements.txt +++ b/torchbenchmark/models/llama/requirements.txt @@ -1,3 +1 @@ -fairscale -fire sentencepiece \ No newline at end of file diff --git a/torchbenchmark/models/llama/test.py b/torchbenchmark/models/llama/test.py index 42f0dace59..30a9f9224c 100644 --- a/torchbenchmark/models/llama/test.py +++ b/torchbenchmark/models/llama/test.py @@ -1,6 +1,6 @@ import torch -from llama import ModelArgs, Transformer, Tokenizer, LLaMA +from .model import ModelArgs, Transformer import torch torch.backends.cudnn.deterministic = False diff --git a/torchbenchmark/models/llama/llama/tokenizer.py b/torchbenchmark/models/llama/tokenizer.py similarity index 100% rename from torchbenchmark/models/llama/llama/tokenizer.py rename to torchbenchmark/models/llama/tokenizer.py From 0ca561fc73379bed6be726a1e0c1016c8ed6299b Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 8 Mar 2023 21:59:37 +0000 Subject: [PATCH 12/26] fixed CI issues --- torchbenchmark/models/llama/__init__.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index 2598eb8a76..940a4cccf5 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -23,9 +23,9 @@ def __init__(self, test, device, jit=False, batch_size=DEFAULT_EVAL_BSIZE, extra # TODO: Implement batching - # if torch.cuda.is_available(): - # torch.set_default_device('cuda') - self.example_inputs = torch.tensor([[1, 1], [1,1]], dtype=torch.int) + if device == "cuda": + torch.set_default_device("cuda") + self.example_inputs = [torch.tensor([[1, 1], [1,1]], dtype=torch.int)] def get_module(self): @@ -43,7 +43,8 @@ def train(self): def eval(self): self.model.eval() with torch.no_grad(): - out=self.model(self.example_inputs, 1) + for example_input in self.example_inputs: + out=self.model(example_input, 1) return (out,) From 2033cddf0851d16306c45cc99d3670f05a365ed3 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 8 Mar 2023 22:11:15 +0000 Subject: [PATCH 13/26] made sure model runs on GPU --- torchbenchmark/models/llama/__init__.py | 1 + torchbenchmark/models/llama/model.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index 940a4cccf5..1d0d813670 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -25,6 +25,7 @@ def __init__(self, test, device, jit=False, batch_size=DEFAULT_EVAL_BSIZE, extra if device == "cuda": torch.set_default_device("cuda") + self.model.to(torch.device("cuda")) self.example_inputs = 
[torch.tensor([[1, 1], [1,1]], dtype=torch.int)] diff --git a/torchbenchmark/models/llama/model.py b/torchbenchmark/models/llama/model.py index a4e1a482ad..974adc49a1 100644 --- a/torchbenchmark/models/llama/model.py +++ b/torchbenchmark/models/llama/model.py @@ -194,7 +194,7 @@ def __init__(self, params: ModelArgs): self.n_layers = params.n_layers self.tok_embeddings = nn.Embedding( - params.vocab_size + 1, params.dim + params.vocab_size + 1, params.dim, ) From 6b617f0cd142f30879252b1a9504f8fc7ce273e9 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Thu, 9 Mar 2023 21:31:11 +0000 Subject: [PATCH 14/26] pass --- torchbenchmark/models/llama/__init__.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index 1d0d813670..1a5b478b13 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -9,9 +9,6 @@ from .model import ModelArgs, Transformer import torch -torch.backends.cudnn.deterministic = False -torch.backends.cudnn.benchmark = True - class Model(BenchmarkModel): DEFAULT_EVAL_BSIZE = 32 task = NLP.LANGUAGE_MODELING @@ -26,7 +23,7 @@ def __init__(self, test, device, jit=False, batch_size=DEFAULT_EVAL_BSIZE, extra if device == "cuda": torch.set_default_device("cuda") self.model.to(torch.device("cuda")) - self.example_inputs = [torch.tensor([[1, 1], [1,1]], dtype=torch.int)] + self.example_inputs = [(torch.tensor([[1, 1], [1,1]], dtype=torch.int), 1)] def get_module(self): @@ -45,7 +42,7 @@ def eval(self): self.model.eval() with torch.no_grad(): for example_input in self.example_inputs: - out=self.model(example_input, 1) + out=self.model(*example_input) return (out,) From 6d3657450f0b2e5ae8575d71e3c539a12b3e04e5 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Thu, 9 Mar 2023 22:37:08 +0000 Subject: [PATCH 15/26] push --- torchbenchmark/models/llama/__init__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index 1a5b478b13..e42ab79664 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -10,12 +10,11 @@ import torch class Model(BenchmarkModel): - DEFAULT_EVAL_BSIZE = 32 task = NLP.LANGUAGE_MODELING - def __init__(self, test, device, jit=False, batch_size=DEFAULT_EVAL_BSIZE, extra_args=[]): + def __init__(self, test, device, jit=False, batch_size=None, extra_args=[]): super().__init__(test=test, device=device, jit=jit, batch_size=batch_size, extra_args=extra_args) - self.model_args = ModelArgs(vocab_size=32) # TODO: Configuring arguments is breaking stuff: max_batch_size=batch_size, max_seq_len=1032 is breaking stuff + self.model_args = ModelArgs(vocab_size=32) self.model = Transformer(self.model_args) # TODO: Implement batching From 2be2c07618078a83e7397eb8a0734dbad2a9ddef Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Fri, 10 Mar 2023 00:54:28 +0000 Subject: [PATCH 16/26] update --- torchbenchmark/models/llama/__init__.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index e42ab79664..cda9dcfba0 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -11,13 +11,13 @@ class Model(BenchmarkModel): task = NLP.LANGUAGE_MODELING - - def __init__(self, test, device, jit=False, batch_size=None, extra_args=[]): - 
super().__init__(test=test, device=device, jit=jit, batch_size=batch_size, extra_args=extra_args) + + # TODO: Implement batching + def __init__(self, test, device, jit=False, batch_size=32, extra_args=[]): + super().__init__(test=test, device=device, jit=jit, batch_size=32, extra_args=extra_args) self.model_args = ModelArgs(vocab_size=32) self.model = Transformer(self.model_args) - # TODO: Implement batching if device == "cuda": torch.set_default_device("cuda") @@ -41,7 +41,9 @@ def eval(self): self.model.eval() with torch.no_grad(): for example_input in self.example_inputs: - out=self.model(*example_input) + tokens, start_pos = example_input + out=self.model(tokens, start_pos) + breakpoint() return (out,) From db7690cca6fe6873d6cc28057222096cc76cd3e8 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Fri, 10 Mar 2023 00:54:54 +0000 Subject: [PATCH 17/26] upd --- torchbenchmark/models/llama/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index cda9dcfba0..664fa8047d 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -43,7 +43,6 @@ def eval(self): for example_input in self.example_inputs: tokens, start_pos = example_input out=self.model(tokens, start_pos) - breakpoint() return (out,) From d039c6d1bfcc434c0dc32811c2bd7bf5ea5994c8 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Fri, 10 Mar 2023 00:57:46 +0000 Subject: [PATCH 18/26] fixed test_llama_example_cuda --- torchbenchmark/models/llama/__init__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index 664fa8047d..7cd290f002 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -26,7 +26,7 @@ def __init__(self, test, device, jit=False, batch_size=32, extra_args=[]): def get_module(self): - return self.model, self.example_inputs + return self.model, *self.example_inputs def train(self): error_msg = """ @@ -41,8 +41,7 @@ def eval(self): self.model.eval() with torch.no_grad(): for example_input in self.example_inputs: - tokens, start_pos = example_input - out=self.model(tokens, start_pos) + out=self.model(*example_input) return (out,) From 4ee4b71a7653ced337d7a53b6b49ad09c07e4d4b Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Fri, 10 Mar 2023 00:58:53 +0000 Subject: [PATCH 19/26] clarify batching limitation --- torchbenchmark/models/llama/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index 7cd290f002..c0621eabd9 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -12,7 +12,7 @@ class Model(BenchmarkModel): task = NLP.LANGUAGE_MODELING - # TODO: Implement batching + # TODO: Implement batching, right if I set as None I get NotImplementedError: Test eval is not implemented. 
def __init__(self, test, device, jit=False, batch_size=32, extra_args=[]): super().__init__(test=test, device=device, jit=jit, batch_size=32, extra_args=extra_args) self.model_args = ModelArgs(vocab_size=32) From 17051bde15a55b985bb83c3e3495960913745bd4 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Fri, 10 Mar 2023 17:40:22 +0000 Subject: [PATCH 20/26] Address Xu feedback --- torchbenchmark/models/llama/__init__.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index c0621eabd9..5d677ef52f 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -22,11 +22,11 @@ def __init__(self, test, device, jit=False, batch_size=32, extra_args=[]): if device == "cuda": torch.set_default_device("cuda") self.model.to(torch.device("cuda")) - self.example_inputs = [(torch.tensor([[1, 1], [1,1]], dtype=torch.int), 1)] + self.example_inputs = (torch.tensor([[1, 1], [1,1]], dtype=torch.int), 1) def get_module(self): - return self.model, *self.example_inputs + return self.model, self.example_inputs def train(self): error_msg = """ @@ -40,8 +40,7 @@ def train(self): def eval(self): self.model.eval() with torch.no_grad(): - for example_input in self.example_inputs: - out=self.model(*example_input) + out=self.model(*self.example_inputs) return (out,) From 2fc7143f75bc5df953d08be4f0119896d782b5c2 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Fri, 10 Mar 2023 17:45:42 +0000 Subject: [PATCH 21/26] Added support for batching --- torchbenchmark/models/llama/__init__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index 5d677ef52f..98f239d155 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -12,9 +12,8 @@ class Model(BenchmarkModel): task = NLP.LANGUAGE_MODELING - # TODO: Implement batching, right if I set as None I get NotImplementedError: Test eval is not implemented. 
- def __init__(self, test, device, jit=False, batch_size=32, extra_args=[]): - super().__init__(test=test, device=device, jit=jit, batch_size=32, extra_args=extra_args) + def __init__(self, test, device, jit=False, batch_size=None, extra_args=[]): + super().__init__(test=test, device=device, jit=jit, batch_size=batch_size, extra_args=extra_args) self.model_args = ModelArgs(vocab_size=32) self.model = Transformer(self.model_args) From 89d3724b890cebbf8dd3e25b1fcf079f536d593a Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Fri, 10 Mar 2023 20:34:15 +0000 Subject: [PATCH 22/26] push --- torchbenchmark/models/llama/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index 98f239d155..e21265725b 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -20,7 +20,8 @@ def __init__(self, test, device, jit=False, batch_size=None, extra_args=[]): if device == "cuda": torch.set_default_device("cuda") - self.model.to(torch.device("cuda")) + + self.model.to(torch.device(device)) self.example_inputs = (torch.tensor([[1, 1], [1,1]], dtype=torch.int), 1) From 21e93c08ce90773a187d6a407cc35b1149e58dd5 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Fri, 10 Mar 2023 12:44:52 -0800 Subject: [PATCH 23/26] Update torchbenchmark/models/llama/__init__.py Co-authored-by: Xu Zhao --- torchbenchmark/models/llama/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index e21265725b..5b61470435 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -22,7 +22,7 @@ def __init__(self, test, device, jit=False, batch_size=None, extra_args=[]): torch.set_default_device("cuda") self.model.to(torch.device(device)) - self.example_inputs = (torch.tensor([[1, 1], [1,1]], dtype=torch.int), 1) + self.example_inputs = (torch.tensor([[1, 1], [1,1]], dtype=torch.int).to(self.device), 1) def get_module(self): From 385431191b331c47c908d1fb3242ffa326570312 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Fri, 10 Mar 2023 20:52:25 +0000 Subject: [PATCH 24/26] update --- torchbenchmark/models/llama/__init__.py | 11 +-------- torchbenchmark/models/llama/test.py | 30 ------------------------ torchbenchmark/models/llama/tokenizer.py | 4 ++-- 3 files changed, 3 insertions(+), 42 deletions(-) delete mode 100644 torchbenchmark/models/llama/test.py diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index e21265725b..704bda204a 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -16,10 +16,6 @@ def __init__(self, test, device, jit=False, batch_size=None, extra_args=[]): super().__init__(test=test, device=device, jit=jit, batch_size=batch_size, extra_args=extra_args) self.model_args = ModelArgs(vocab_size=32) self.model = Transformer(self.model_args) - - - if device == "cuda": - torch.set_default_device("cuda") self.model.to(torch.device(device)) self.example_inputs = (torch.tensor([[1, 1], [1,1]], dtype=torch.int), 1) @@ -41,9 +37,4 @@ def eval(self): self.model.eval() with torch.no_grad(): out=self.model(*self.example_inputs) - return (out,) - - - - - + return (out,) \ No newline at end of file diff --git a/torchbenchmark/models/llama/test.py b/torchbenchmark/models/llama/test.py deleted file mode 100644 index 30a9f9224c..0000000000 --- 
a/torchbenchmark/models/llama/test.py +++ /dev/null @@ -1,30 +0,0 @@ - -import torch -from .model import ModelArgs, Transformer -import torch - -torch.backends.cudnn.deterministic = False -torch.backends.cudnn.benchmark = True - -class Model: - def __init__(self, temperature: float = 0.8, top_p: float = 0.95): - self.model_args = ModelArgs() - self.generator = Transformer(self.model_args) - self.temperature = temperature - self.top_p = top_p - - def get_module(self): - return self.generator - - def train(self): - return NotImplementedError - - def eval(self): - return NotImplementedError - -if __name__ == "__main__": - model = Model() - module = model.get_module() - input_tensor = torch.tensor([[5, 1, 1], [1,1]], dtype=torch.int) - - module(input_tensor, 1) \ No newline at end of file diff --git a/torchbenchmark/models/llama/tokenizer.py b/torchbenchmark/models/llama/tokenizer.py index e4315856ee..95b14ce0fb 100644 --- a/torchbenchmark/models/llama/tokenizer.py +++ b/torchbenchmark/models/llama/tokenizer.py @@ -15,14 +15,14 @@ def __init__(self, model_path: str): # reload tokenizer assert os.path.isfile(model_path), model_path self.sp_model = SentencePieceProcessor(model_file=model_path) - logger.info(f"Reloaded SentencePiece model from {model_path}") + logger.debug(f"Reloaded SentencePiece model from {model_path}") # BOS / EOS token IDs self.n_words: int = self.sp_model.vocab_size() self.bos_id: int = self.sp_model.bos_id() self.eos_id: int = self.sp_model.eos_id() self.pad_id: int = self.sp_model.pad_id() - logger.info( + logger.debug( f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}" ) assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() From faf928f22b3b1e294fecc3d6f7e6b6bb4344b8af Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Fri, 10 Mar 2023 20:54:06 +0000 Subject: [PATCH 25/26] push --- torchbenchmark/models/llama/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index a386fec1b7..71f8f3969a 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -17,8 +17,8 @@ def __init__(self, test, device, jit=False, batch_size=None, extra_args=[]): self.model_args = ModelArgs(vocab_size=32) self.model = Transformer(self.model_args) - self.model.to(torch.device(device)) - self.example_inputs = (torch.tensor([[1, 1], [1,1]], dtype=torch.int).to(self.device), 1) + torch.set_default_device(device) + self.example_inputs = (torch.tensor([[1, 1], [1,1]], dtype=torch.int), 1) def get_module(self): From d8272886c4a5c81ff26b23b68fe8f65b8db14964 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Fri, 10 Mar 2023 15:16:53 -0800 Subject: [PATCH 26/26] Update __init__.py --- torchbenchmark/models/llama/__init__.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/torchbenchmark/models/llama/__init__.py b/torchbenchmark/models/llama/__init__.py index 71f8f3969a..9f17cae38b 100644 --- a/torchbenchmark/models/llama/__init__.py +++ b/torchbenchmark/models/llama/__init__.py @@ -15,10 +15,9 @@ class Model(BenchmarkModel): def __init__(self, test, device, jit=False, batch_size=None, extra_args=[]): super().__init__(test=test, device=device, jit=jit, batch_size=batch_size, extra_args=extra_args) self.model_args = ModelArgs(vocab_size=32) - self.model = Transformer(self.model_args) - - torch.set_default_device(device) - self.example_inputs = (torch.tensor([[1, 1], [1,1]], dtype=torch.int), 1) 
+ torch.set_default_device(self.device) + self.model = Transformer(self.model_args).to(self.device) + self.example_inputs = (torch.tensor([[1, 1], [1,1]], dtype=torch.int).to(self.device), 1) def get_module(self): @@ -37,4 +36,4 @@ def eval(self): self.model.eval() with torch.no_grad(): out=self.model(*self.example_inputs) - return (out,) \ No newline at end of file + return (out,)
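
For reference, below is a minimal, hypothetical usage sketch of the benchmark model as it stands after the final patch in this series. It is not part of the patch series itself: the constructor signature, get_module(), and eval() behavior mirror the final torchbenchmark/models/llama/__init__.py above, but the standalone invocation, the test="eval" / device arguments, and the assumption that the torchbenchmark package and sentencepiece are installed are illustrative only.

# Illustrative sketch (not part of the patches). Assumes the torchbenchmark
# package from this repository is importable and the llama files added above
# are present.
import torch

from torchbenchmark.models.llama import Model

if __name__ == "__main__":
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Constructor signature follows the final __init__.py in PATCH 26/26.
    model = Model(test="eval", device=device)

    # get_module() returns the Transformer plus its example inputs:
    # a (batch, seq_len) tensor of token ids and a start_pos of 1.
    module, example_inputs = model.get_module()
    tokens, start_pos = example_inputs

    # eval() runs a single forward pass under torch.no_grad() and returns
    # the last-position logits as a one-element tuple.
    (logits,) = model.eval()
    print(logits.shape)  # expected: (batch, vocab_size + 1), here (2, 33)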