Commit
* layer_type support List[str]
* add mllama support
* check layer not MllamaCrossAttentionDecoderLayer
* TODO need image dataset for vision quantization
* Update mllama.py
* comment on mllama repeating 4 layer group structure

Co-authored-by: LRL-ModelCloud <lrl@modelcloud.ai>
Co-authored-by: Qubitium-ModelCloud <qubitium@modelcloud.ai>
1 parent 4b9506f · commit 4921d68
Showing 5 changed files with 34 additions and 3 deletions.
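The first bullet of the commit message, `layer_type` support for List[str], means a model definition can now declare more than one repeating-layer class, and the quantizer can check each layer's class name against that list and skip `MllamaCrossAttentionDecoderLayer` until an image calibration dataset is supported. The sketch below is illustrative only; the helper names are not from this commit, it just shows how a str-or-list check of that kind could be written:

```python
from typing import List, Union

# Illustrative sketch only: these helper names are assumptions, not code from this commit.
def allowed_layer_types(layer_type: Union[str, List[str]]) -> List[str]:
    """Normalize a model's `layer_type` declaration (str or List[str]) to a list of class names."""
    return [layer_type] if isinstance(layer_type, str) else list(layer_type)

def is_quantizable(layer, layer_type: Union[str, List[str]]) -> bool:
    """A layer qualifies if its class is declared in `layer_type` and it is not the
    Mllama cross-attention layer, which would need image (pixel_values) calibration data."""
    name = type(layer).__name__
    return name in allowed_layer_types(layer_type) and name != "MllamaCrossAttentionDecoderLayer"
```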
@@ -0,0 +1,22 @@
from .base import BaseGPTQModel


# TODO FIXME: we currently do not support quantizing the cross-attention layers (pixel_values)
class MLlamaGPTQ(BaseGPTQModel):
    # Non-repeating layers at the root level: same level as `layers_node`
    # Excluding `layers_node`.
    base_modules = ["language_model.model.embed_tokens", "language_model.model.norm"]

    # Below describes all the repeating layers in this transformer model
    # `model.layers` is a node/module that holds all the repeating layers: the parent node for all n layers.
    layers_node = "language_model.model.layers"
    # Mllama has two types of repeating layers. They repeat in groups of 4: layers 0-2 (the first 3) are text layers, layer 3 (the 4th) is the cross-attention layer for vision
    layer_type = ["MllamaSelfAttentionDecoderLayer", "MllamaCrossAttentionDecoderLayer"]
    # Inside each decoder layer are many internal modules
    # List them in the order they are executed in the model's forward() code
    # Many models share the same execution order: attention (q_k_v) projection, attention (output) projection, mlp (n) projections
    layer_modules = [
        ["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"],
        ["self_attn.o_proj"],
        ["mlp.up_proj", "mlp.gate_proj"],
        ["mlp.down_proj"],
    ]
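For context, a model definition like the one above is normally selected by model type and driven by the library's quantization entry point. The snippet below is a rough usage sketch, not code from this commit: the API names (`GPTQModel.load`, `QuantizeConfig`, `quantize`, `save`) and the checkpoint id are assumptions and may differ by library version. Per the TODO, only the text (self-attention) layers would be calibrated, using a text-only dataset.

```python
# Rough usage sketch; API names and the model id are assumptions and may
# differ by GPTQModel version -- this is not code from this commit.
from gptqmodel import GPTQModel, QuantizeConfig

model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"  # hypothetical Mllama checkpoint

quant_config = QuantizeConfig(bits=4, group_size=128)

# Text-only calibration data: cross-attention (vision) layers are not
# quantized until an image dataset is supported (see the TODO above).
calibration = [
    "The quick brown fox jumps over the lazy dog.",
    "GPTQ quantizes weights layer by layer using a small calibration set.",
]

model = GPTQModel.load(model_id, quant_config)
model.quantize(calibration)
model.save("Llama-3.2-11B-Vision-Instruct-gptq-4bit")
```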