
[Model] Add support for Aya-23 8B Model by Cohere #2603

Merged: 14 commits, merged on Aug 5, 2024
1 change: 1 addition & 0 deletions python/mlc_llm/conversation_template/__init__.py
@@ -7,6 +7,7 @@

# model preset templates
from . import (
cohere,
dolly,
gemma,
glm,
27 changes: 27 additions & 0 deletions python/mlc_llm/conversation_template/cohere.py
@@ -0,0 +1,27 @@
"""Cohere default templates"""
# pylint: disable=line-too-long

# Referred from: https://huggingface.co/CohereForAI/aya-23-8B/blob/main/tokenizer_config.json

from mlc_llm.protocol.conversation_protocol import Conversation, MessagePlaceholders

from .registry import ConvTemplateRegistry

# Aya-23
ConvTemplateRegistry.register_conv_template(
Conversation(
name="aya-23",
system_template=f"<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{MessagePlaceholders.SYSTEM.value}<|END_OF_TURN_TOKEN|>",
system_message="You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses.",
roles={
"user": "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>",
"assistant": "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
},
seps=["<|END_OF_TURN_TOKEN|>"],
role_content_sep="",
role_empty_sep="",
system_prefix_token_ids=[5],
stop_str=["<|END_OF_TURN_TOKEN|>"],
stop_token_ids=[6, 255001],
)
)
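For context, the fields above produce a Command-R-style turn format. Below is a minimal sketch of roughly how a single user turn would be rendered from this template, assembled by hand for illustration only; MLC's actual `Conversation` rendering also prepends the BOS prefix at the token-id level (`system_prefix_token_ids=[5]`), and exact whitespace handling may differ.

```python
# Illustrative only: manually assemble a single-turn Aya-23 prompt from the
# template fields registered above. Not MLC's actual rendering code.
SYSTEM = (
    "You are Command-R, a brilliant, sophisticated, AI-assistant trained to "
    "assist human users by providing thorough responses."
)
prompt = (
    f"<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{SYSTEM}<|END_OF_TURN_TOKEN|>"
    "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello!<|END_OF_TURN_TOKEN|>"
    "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
)
print(prompt)
```

Generation then stops at `<|END_OF_TURN_TOKEN|>` (token ids 6 and 255001), per the `stop_str` and `stop_token_ids` fields.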
3 changes: 2 additions & 1 deletion python/mlc_llm/interface/gen_config.py
@@ -130,7 +130,7 @@ def gen_config( # pylint: disable=too-many-locals,too-many-arguments,too-many-b
prefill_chunk_size=model_config.prefill_chunk_size,
attention_sink_size=getattr(model_config, "attention_sink_size", -1),
tensor_parallel_shards=model_config.tensor_parallel_shards,
conv_template=conversation,
conv_template=conversation, # type: ignore
Member:

Just curious, why do we disable mypy for this line?

Contributor Author:

Pylance was raising a reportArgumentType issue for this line; the screenshot is attached below:

[screenshot: Pylance reportArgumentType warning]

)
# Step 2. Load `generation_config.json` and `config.json` for text-generation related configs
for generation_config_filename in ["generation_config.json", "config.json"]:
@@ -299,4 +299,5 @@ def gen_config( # pylint: disable=too-many-locals,too-many-arguments,too-many-b
"llava",
"hermes2_pro_llama3",
"tinyllama_v1_0",
"aya-23",
}
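With "aya-23" added to the preset set, the registered template can be fetched by name. A small usage sketch, assuming `ConvTemplateRegistry` exposes a `get_conv_template` lookup (adjust if the actual accessor differs):

```python
# Hypothetical usage sketch: fetch the newly registered Aya-23 preset by name.
from mlc_llm.conversation_template.registry import ConvTemplateRegistry

conv = ConvTemplateRegistry.get_conv_template("aya-23")
print(conv.roles["user"])   # <|START_OF_TURN_TOKEN|><|USER_TOKEN|>
print(conv.stop_token_ids)  # [6, 255001]
```

Since the name is now in the preset set validated by `gen_config`, passing it as the conversation template during config generation should also be accepted.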
Empty file.
172 changes: 172 additions & 0 deletions python/mlc_llm/model/cohere/cohere_loader.py
@@ -0,0 +1,172 @@
"""
This file specifies how MLC's Cohere parameter maps from other formats, for example HuggingFace
PyTorch, HuggingFace safetensors.
"""

import functools

import numpy as np

from mlc_llm.loader import ExternMapping
from mlc_llm.quantization import Quantization

from .cohere_model import CohereConfig, CohereForCausalLM
from .cohere_quantization import awq_quant


def huggingface(model_config: CohereConfig, quantization: Quantization) -> ExternMapping:
"""Returns a parameter mapping that maps from the names of MLC LLM parameters to
the names of HuggingFace PyTorch parameters.

Parameters
----------
model_config : CohereConfig
The configuration of the Cohere model.

quantization : Quantization
The quantization configuration.

Returns
-------
param_map : ExternMapping
The parameter mapping from MLC to HuggingFace PyTorch.
"""
model = CohereForCausalLM(model_config)
if quantization is not None:
model.to(quantization.model_dtype)
_, _named_params, _ = model.export_tvm( # type: ignore[misc]
spec=model.get_default_spec(),
allow_extern=True,
)
named_parameters = dict(_named_params)

mapping = ExternMapping()

def _add(mlc_name, hf_name):
mapping.add_mapping(
mlc_name,
[hf_name],
functools.partial(
lambda x, dtype: x.astype(dtype),
dtype=named_parameters[mlc_name].dtype,
),
)

for i in range(model_config.num_hidden_layers):
# Add QKV in self attention
attn = f"model.layers.{i}.self_attn"
mlc_name = f"{attn}.qkv_proj.weight"
mlc_param = named_parameters[mlc_name]
_add(f"{attn}.out_proj.weight", f"{attn}.o_proj.weight")
mapping.add_mapping(
mlc_name,
[
f"{attn}.q_proj.weight",
f"{attn}.k_proj.weight",
f"{attn}.v_proj.weight",
],
functools.partial(
lambda q, k, v, dtype: np.concatenate([q, k, v], axis=0).astype(dtype),
dtype=mlc_param.dtype,
),
)
# Add gates in MLP
mlp = f"model.layers.{i}.mlp"
_add(f"{mlp}.up_proj.weight", f"{mlp}.up_proj.weight")
_add(f"{mlp}.gate_proj.weight", f"{mlp}.gate_proj.weight")
_add(f"{mlp}.down_proj.weight", f"{mlp}.down_proj.weight")
# inv_freq is not used in the model
# mapping.add_unused(f"{attn}.rotary_emb.inv_freq")

for mlc_name, mlc_param in named_parameters.items():
if mlc_name not in mapping.param_map:
mapping.add_mapping(
mlc_name,
[mlc_name],
functools.partial(
lambda x, dtype: x.astype(dtype),
dtype=mlc_param.dtype,
),
)

return mapping


# https://huggingface.co/alijawad07/aya-23-8B-AWQ-GEMM/tree/main
def awq(model_config: CohereConfig, quantization: Quantization) -> ExternMapping:
"""Returns a parameter mapping that maps from the names of MLC LLM parameters to
the names of AWQ parameters.
Parameters
----------
model_config : CohereConfig
The configuration of the Cohere model.

quantization : Quantization
The quantization configuration.

Returns
-------
param_map : ExternMapping
The parameter mapping from MLC to AWQ.
"""
model, _ = awq_quant(model_config, quantization)
_, _named_params, _ = model.export_tvm( # type: ignore[misc]
spec=model.get_default_spec(), # type: ignore[attr-defined]
allow_extern=True,
)
named_parameters = dict(_named_params)

mapping = ExternMapping()

def _add(mlc_name, hf_name):
mapping.add_mapping(
mlc_name,
[hf_name],
functools.partial(
lambda x, dtype: x.astype(dtype),
dtype=named_parameters[mlc_name].dtype,
),
)

for i in range(model_config.num_hidden_layers):
# Add QKV in self attention
attn = f"model.layers.{i}.self_attn"
for quantize_suffix in ["qweight", "qzeros", "scales"]:
mlc_name = f"{attn}.qkv_proj.{quantize_suffix}"
assert mlc_name in named_parameters
mlc_param = named_parameters[mlc_name]
mapping.add_mapping(
mlc_name,
[
f"{attn}.q_proj.{quantize_suffix}",
f"{attn}.k_proj.{quantize_suffix}",
f"{attn}.v_proj.{quantize_suffix}",
],
functools.partial(
lambda q, k, v, dtype: np.concatenate(
[q, k, v],
axis=1, # AWQ GEMM would transpose the weight
).astype(dtype),
dtype=mlc_param.dtype,
),
)
_add(f"{attn}.out_proj.{quantize_suffix}", f"{attn}.o_proj.{quantize_suffix}")

# Concat gate and up in MLP
mlp = f"model.layers.{i}.mlp"
for quantize_suffix in ["qweight", "qzeros", "scales"]:
_add(f"{mlp}.up_proj.{quantize_suffix}", f"{mlp}.up_proj.{quantize_suffix}")
_add(f"{mlp}.gate_proj.{quantize_suffix}", f"{mlp}.gate_proj.{quantize_suffix}")
_add(f"{mlp}.down_proj.{quantize_suffix}", f"{mlp}.down_proj.{quantize_suffix}")

# inv_freq is not used in the model
# mapping.add_unused(f"{attn}.rotary_emb.inv_freq")

for mlc_name, mlc_param in named_parameters.items():
if mlc_name not in mapping.param_map:
mapping.add_mapping(
mlc_name,
[mlc_name],
functools.partial(lambda x, dtype: x.astype(dtype), dtype=mlc_param.dtype),
)
return mapping
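The two loaders above differ mainly in how the per-projection tensors are fused: the HuggingFace path concatenates q/k/v weights along axis 0 (stacking output rows), while the AWQ path concatenates qweight/qzeros/scales along axis 1 because AWQ GEMM keeps the packed tensors transposed, as the inline comment notes. A toy NumPy sketch of the shape bookkeeping follows; the dimensions and the 8-values-per-int32 packing factor are illustrative assumptions, not the real Aya-23 8B sizes.

```python
import numpy as np

# Toy shapes only; the real Aya-23 8B dimensions are much larger.
hidden, q_out, kv_out = 16, 16, 8

# HuggingFace layout is [out_features, in_features], so q/k/v stack along axis 0.
q = np.ones((q_out, hidden), dtype="float16")
k = np.ones((kv_out, hidden), dtype="float16")
v = np.ones((kv_out, hidden), dtype="float16")
qkv = np.concatenate([q, k, v], axis=0)
assert qkv.shape == (q_out + 2 * kv_out, hidden)

# AWQ GEMM stores packed tensors transposed ([in_features, packed_out_features]),
# which is why the second loader concatenates along axis 1 instead.
# Assumed packing: 8 4-bit values per int32 column.
q_awq = np.ones((hidden, q_out // 8), dtype="int32")
k_awq = np.ones((hidden, kv_out // 8), dtype="int32")
v_awq = np.ones((hidden, kv_out // 8), dtype="int32")
qkv_awq = np.concatenate([q_awq, k_awq, v_awq], axis=1)
assert qkv_awq.shape == (hidden, (q_out + 2 * kv_out) // 8)
```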