Merge branch 'google-ai-edge:main' into fix_sample_inference_error
nigelzzzzzzz authored Sep 25, 2024
2 parents 12f69da + 9ae6590 commit 26bc0da
Showing 16 changed files with 851 additions and 97 deletions.
10 changes: 7 additions & 3 deletions ai_edge_torch/generative/examples/README.md
@@ -7,9 +7,13 @@ Gemma is Google's open-source LLM. The model comes in both 2B and 7B versions. See
## TinyLlama
[TinyLlama](https://github.com/jzhang38/TinyLlama) is a popular OSS smaller version of Meta's Llama2 model, with only 1.1B parameters. [HuggingFace checkpoint](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0).

## Microsoft Phi-2
Microsoft Phi-2 is also a decoder-only LLM with 2.7B parameters, see details on
[Kaggle](https://www.kaggle.com/models/Microsoft/phi/transformers/2).
## Microsoft Phi-2 and 3.5-mini
Microsoft Phi-2 and Phi-3.5-mini are also decoder-only LLMs, with 2.7B and
3.82B parameters respectively. See details on
[Kaggle](https://www.kaggle.com/models/Microsoft/phi/transformers/2) for Phi-2
and [HuggingFace](https://huggingface.co/microsoft/Phi-3.5-mini-instruct) for
Phi-3.5-mini. Note that the Phi-3.5-mini example supports up to 4K tokens, not
the 128K tokens that the original Phi-3.5 supports.

## Apple OpenELM
[Apple OpenELM](https://huggingface.co/apple/OpenELM) is also a decoder-only LLM
68 changes: 68 additions & 0 deletions ai_edge_torch/generative/examples/phi/convert_phi3_to_tflite.py
@@ -0,0 +1,68 @@
# Copyright 2024 The AI Edge Torch Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Example of converting a Phi-3.5 model to multi-signature tflite model."""

import os
import pathlib

from absl import app
from absl import flags
from ai_edge_torch.generative.examples.phi import phi3
from ai_edge_torch.generative.utilities import converter

_CHECKPOINT_PATH = flags.DEFINE_string(
    'checkpoint_path',
    os.path.join(pathlib.Path.home(), 'Downloads/llm_data/phi3'),
    'The path to the model checkpoint, or directory holding the checkpoint.',
)
_TFLITE_PATH = flags.DEFINE_string(
    'tflite_path',
    '/tmp/',
    'The tflite file path to export.',
)
_PREFILL_SEQ_LEN = flags.DEFINE_integer(
    'prefill_seq_len',
    1024,
    'The maximum size of prefill input tensor.',
)
_KV_CACHE_MAX_LEN = flags.DEFINE_integer(
    'kv_cache_max_len',
    1280,
    'The maximum size of KV cache buffer, including both prefill and decode.',
)
_QUANTIZE = flags.DEFINE_bool(
    'quantize',
    True,
    'Whether the model should be quantized.',
)


def main(_):
  pytorch_model = phi3.build_model(
      _CHECKPOINT_PATH.value, kv_cache_max_len=_KV_CACHE_MAX_LEN.value
  )
  quant_suffix = 'q8' if _QUANTIZE.value else 'f32'
  output_filename = f'phi3_{quant_suffix}_seq{_PREFILL_SEQ_LEN.value}_ekv{_KV_CACHE_MAX_LEN.value}.tflite'
  converter.convert_to_tflite(
      pytorch_model,
      tflite_path=os.path.join(_TFLITE_PATH.value, output_filename),
      prefill_seq_len=_PREFILL_SEQ_LEN.value,
      quantize=_QUANTIZE.value,
  )


if __name__ == '__main__':
  app.run(main)
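For reference, a minimal sketch of what main() does when run with the default flags; the checkpoint is assumed to already be downloaded to the default path, and the output name matches the f-string above:

import os
import pathlib

from ai_edge_torch.generative.examples.phi import phi3
from ai_edge_torch.generative.utilities import converter

# Build the PyTorch Phi-3.5 model from the (assumed) local checkpoint.
checkpoint = os.path.join(pathlib.Path.home(), 'Downloads/llm_data/phi3')
model = phi3.build_model(checkpoint, kv_cache_max_len=1280)

# Convert to a quantized .tflite with a 1024-token prefill signature.
converter.convert_to_tflite(
    model,
    tflite_path='/tmp/phi3_q8_seq1024_ekv1280.tflite',
    prefill_seq_len=1024,
    quantize=True,
)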
286 changes: 286 additions & 0 deletions ai_edge_torch/generative/examples/phi/phi3.py
@@ -0,0 +1,286 @@
# Copyright 2024 The AI Edge Torch Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Example of building a Phi-3.5 model up to 4K tokens, not to 128K tokens."""

import math
from typing import Tuple

from ai_edge_torch.generative.layers import attention
from ai_edge_torch.generative.layers import builder
from ai_edge_torch.generative.layers import kv_cache as kv_utils
import ai_edge_torch.generative.layers.attention_utils as attn_utils
import ai_edge_torch.generative.layers.model_config as cfg
import ai_edge_torch.generative.utilities.loader as loading_utils
import torch
from torch import nn

TENSOR_NAMES = loading_utils.ModelLoader.TensorNames(
    ff_up_proj="model.layers.{}.mlp.gate_up_proj",
    ff_down_proj="model.layers.{}.mlp.down_proj",
    attn_fused_qkv_proj="model.layers.{}.self_attn.qkv_proj",
    attn_output_proj="model.layers.{}.self_attn.o_proj",
    pre_attn_norm="model.layers.{}.input_layernorm",
    post_attn_norm="model.layers.{}.post_attention_layernorm",
    embedding="model.embed_tokens",
    final_norm="model.norm",
    lm_head="lm_head",
)

# max_position_embeddings / original_max_position_embeddings in Phi-3.5 config.
ROPE_SCALE_FACTOR = 32

# RoPE short factor in Phi-3.5 config. According to the LongRoPE paper and its
# code in https://github.com/microsoft/LongRoPE, these values were searched
# with min=1.0, step=0.01 to minimize the error on a sample dataset.
ROPE_SHORT_FACTOR = [
    1.0,
    1.0199999809265137,
    1.0299999713897705,
    1.0299999713897705,
    1.0499999523162842,
    1.0499999523162842,
    1.0499999523162842,
    1.0499999523162842,
    1.0499999523162842,
    1.0699999332427979,
    1.0999999046325684,
    1.1099998950958252,
    1.1599998474121094,
    1.1599998474121094,
    1.1699998378753662,
    1.2899998426437378,
    1.339999794960022,
    1.679999828338623,
    1.7899998426437378,
    1.8199998140335083,
    1.8499997854232788,
    1.8799997568130493,
    1.9099997282028198,
    1.9399996995925903,
    1.9899996519088745,
    2.0199997425079346,
    2.0199997425079346,
    2.0199997425079346,
    2.0199997425079346,
    2.0199997425079346,
    2.0199997425079346,
    2.0299997329711914,
    2.0299997329711914,
    2.0299997329711914,
    2.0299997329711914,
    2.0299997329711914,
    2.0299997329711914,
    2.0299997329711914,
    2.0299997329711914,
    2.0299997329711914,
    2.0799996852874756,
    2.0899996757507324,
    2.189999580383301,
    2.2199995517730713,
    2.5899994373321533,
    2.729999542236328,
    2.749999523162842,
    2.8399994373321533,
]
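# For illustration (values from get_model_config below): with
# ROPE_SCALE_FACTOR = 32 and max_seq_len = 4096, the scale passed to
# build_rope_cache in Phi3_5Mini.__init__ evaluates to
#   math.sqrt(1 + math.log(32) / math.log(4096)) = sqrt(1 + 5/12) ≈ 1.1902,
# i.e. the precomputed cos/sin values are uniformly amplified by about 19%.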


def build_rope_cache(
    size: int,
    dim: int,
    base: int = 10000,
    condense_ratio: int = 1,
    dtype: torch.dtype = torch.float32,
    device: torch.device = None,
    theta_factors: torch.Tensor = None,
    scale: float = 1.0,
) -> Tuple[torch.Tensor, torch.Tensor]:
  """Precomputes Rotary Positional Embeddings for the Phi-3.5 model.

  It's a modified version of attn_utils.build_rope_cache with additional
  arguments for the Phi-3.5 model. It precomputes Rotary Positional Embedding
  sin and cos values with scaling factors for quick lookup during inference.

  Args:
    size (int): The size of the built cache.
    dim (int): Each sequence's dimension.
    base (int, optional): Rope base value. Defaults to 10000.
    condense_ratio (int, optional): The ratio by which sequence indices are
      condensed. Defaults to 1.
    dtype (torch.dtype, optional): Output tensor's data type. Defaults to
      torch.float32.
    device (torch.device, optional): Output tensor's device. Defaults to None,
      in which case "cpu" is used.
    theta_factors (torch.Tensor, optional): A tensor of shape (dim // 2,) used
      to scale the theta values. Defaults to None.
    scale (float, optional): A float used to scale the rope values. Defaults
      to 1.0.

  Returns:
    Tuple[torch.Tensor, torch.Tensor]: Rope's cosine and sine waves.
  """
  if device is None:
    device = torch.device("cpu")
  theta = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
  if theta_factors is not None:
    theta = theta / theta_factors
  seq_idx = torch.arange(size) / condense_ratio
  idx_theta = torch.outer(seq_idx, theta)
  cos = torch.cos(idx_theta).to(dtype=dtype, device=device) * scale
  sin = torch.sin(idx_theta).to(dtype=dtype, device=device) * scale
  return cos, sin
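# A minimal usage sketch (illustrative only), mirroring the call in
# Phi3_5Mini.__init__ below:
#
#   cos, sin = build_rope_cache(
#       size=1280,  # kv_cache_max
#       dim=96,     # rotary_percentage * head_dim
#       theta_factors=torch.tensor(ROPE_SHORT_FACTOR),
#       scale=1.19,
#   )
#
# theta has dim // 2 = 48 entries, one per ROPE_SHORT_FACTOR value, so cos and
# sin each have shape (1280, 48).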


class Phi3_5Mini(nn.Module):
  """A Phi-3.5 model built from the Edge Generative API layers."""

  def __init__(self, config: cfg.ModelConfig):
    super().__init__()

    # Construct model layers.
    self.lm_head = nn.Linear(
        config.embedding_dim, config.vocab_size, bias=config.lm_head_use_bias
    )
    self.tok_embedding = nn.Embedding(
        config.vocab_size, config.embedding_dim, padding_idx=0
    )
    # Phi-3.5 has only one block config.
    block_config = config.block_config(0)
    self.transformer_blocks = nn.ModuleList(
        attention.TransformerBlock(block_config, config)
        for _ in range(config.num_layers)
    )
    self.final_norm = builder.build_norm(
        config.embedding_dim,
        config.final_norm_config,
    )
    attn_config = block_config.attn_config
    self.rope_cache = build_rope_cache(
        size=config.kv_cache_max,
        dim=int(attn_config.rotary_percentage * attn_config.head_dim),
        base=10_000,
        condense_ratio=1,
        dtype=torch.float32,
        device=torch.device("cpu"),
        theta_factors=torch.tensor(ROPE_SHORT_FACTOR),
        scale=math.sqrt(
            1 + math.log(ROPE_SCALE_FACTOR) / math.log(config.max_seq_len)
        ),
    )
    self.mask_cache = attn_utils.build_causal_mask_cache(
        size=config.kv_cache_max,
        dtype=torch.float32,
        device=torch.device("cpu"),
    )
    self.config = config

  @torch.inference_mode
  def forward(
      self,
      tokens: torch.Tensor,
      input_pos: torch.Tensor,
      kv_cache: kv_utils.KVCache,
  ) -> dict[torch.Tensor, kv_utils.KVCache]:
    _, seq_len = tokens.size()
    assert self.config.max_seq_len >= seq_len, (
        f"Cannot forward sequence of length {seq_len}, max seq length is only"
        f" {self.config.max_seq_len}"
    )
    assert len(self.transformer_blocks) == len(kv_cache.caches), (
        "The number of transformer blocks and the number of KV cache entries"
        " must be the same."
    )

    cos, sin = self.rope_cache
    cos = cos.index_select(0, input_pos)
    sin = sin.index_select(0, input_pos)
    mask = self.mask_cache.index_select(2, input_pos)
    mask = mask[:, :, :, : self.config.kv_cache_max]

    x = self.tok_embedding(tokens)

    updated_kv_entries = []
    for i, block in enumerate(self.transformer_blocks):
      kv_entry = kv_cache.caches[i] if kv_cache else None
      x, kv_entry = block(x, (cos, sin), mask, input_pos, kv_entry)
      if kv_entry:
        updated_kv_entries.append(kv_entry)
    updated_kv_cache = kv_utils.KVCache(tuple(updated_kv_entries))

    x = self.final_norm(x)
    logits = self.lm_head(x)  # (b, t, vocab_size)
    return {"logits": logits, "kv_cache": updated_kv_cache}


def get_model_config(kv_cache_max_len: int = 1024) -> cfg.ModelConfig:
  """Returns the model config for a Phi-3.5 model.

  Args:
    kv_cache_max_len (int): The maximum sequence length of the KV cache.
      Default is 1024.

  Returns:
    The model config for a Phi-3.5 model.
  """
  attn_config = cfg.AttentionConfig(
      num_heads=32,
      head_dim=96,
      num_query_groups=32,
      rotary_percentage=1.0,
      qkv_transpose_before_split=True,
  )
  ff_config = cfg.FeedForwardConfig(
      type=cfg.FeedForwardType.SEQUENTIAL,
      activation=cfg.ActivationConfig(cfg.ActivationType.SILU_GLU),
      intermediate_size=8192,
  )
  norm_config = cfg.NormalizationConfig(type=cfg.NormalizationType.RMS_NORM)
  block_config = cfg.TransformerBlockConfig(
      attn_config=attn_config,
      ff_config=ff_config,
      pre_attention_norm_config=norm_config,
      post_attention_norm_config=norm_config,
  )
  config = cfg.ModelConfig(
      vocab_size=32064,
      num_layers=32,
      max_seq_len=4096,
      kv_cache_max_len=kv_cache_max_len,
      embedding_dim=3072,
      block_configs=block_config,
      final_norm_config=norm_config,
      enable_hlfb=True,
  )
  return config
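# Sanity notes on the config above (illustrative, not part of the checked-in
# code): num_heads * head_dim = 32 * 96 = 3072, matching embedding_dim, and
# num_query_groups == num_heads means plain multi-head attention (no
# grouped-query sharing).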


def get_fake_model_config(kv_cache_max_len: int = 128) -> cfg.ModelConfig:
  config = get_model_config(kv_cache_max_len)
  config.vocab_size = 128
  config.num_layers = 2
  config.max_seq_len = 2 * kv_cache_max_len
  # Phi-3.5 has only one block config.
  config.block_config(0).ff_config.intermediate_size = 128
  return config


def build_model(checkpoint_path: str, **kwargs) -> nn.Module:
  """Instantiates the model and loads the checkpoint if provided."""
  config = get_model_config(**kwargs)
  model = Phi3_5Mini(config)
  loader = loading_utils.ModelLoader(checkpoint_path, TENSOR_NAMES)
  loader.load(model)
  model.eval()
  return model
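A minimal end-to-end sketch of driving this model (illustrative; the checkpoint path is a placeholder, and KVCache.from_model_config is assumed to be the cache constructor used by this repository's other examples):

import torch

from ai_edge_torch.generative.examples.phi import phi3
from ai_edge_torch.generative.layers import kv_cache as kv_utils

model = phi3.build_model("/path/to/phi3_checkpoint", kv_cache_max_len=1280)
tokens = torch.zeros((1, 8), dtype=torch.int)    # 8 placeholder token ids
input_pos = torch.arange(0, 8, dtype=torch.int)  # prefill positions 0..7
kv = kv_utils.KVCache.from_model_config(model.config)  # assumed API
output = model.forward(tokens, input_pos, kv)
logits = output["logits"]  # shape (1, 8, 32064)
kv = output["kv_cache"]    # updated cache, reused for decode steps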
1 change: 0 additions & 1 deletion ai_edge_torch/generative/examples/phi/verify.py
@@ -27,7 +27,6 @@
"Instruct: Write an email about the weather Output:",
"The input prompts to generate answers.",
)

_MAX_NEW_TOKENS = flags.DEFINE_integer(
"max_new_tokens",
30,