This repository has been archived by the owner on Oct 25, 2024. It is now read-only.

qbits deprecate clip postfix #1672

Draft · wants to merge 1 commit into base: main
6 changes: 3 additions & 3 deletions docs/qbits.md
@@ -16,7 +16,7 @@ import intel_extension_for_transformers.qbits as qbits
transpose (bool): Whether to transpose the weight tensor (required for quantize_to_packed_weight with KxN weight shape).
blocksize (int): Blocksize for weight-only quantization.
compute_type (str): Computation type (fp32/bf16/int8). fp32 will leverage AVX2/AVX512F to compute, bf16 will be AMX_BF16, int8 will be VNNI/AMX_INT8.
- weight_type (str): Quantization type (int8/int4_clip/int4_fullrange/nf4/fp4_e2m1).
+ weight_type (str): Quantization type (int8/int4/int3/int2/nf4/fp4_e2m1).
scale_type (str): Scale type (fp32/bf16).
asym (bool): Whether to use asymmetric quantization.
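
For context, here is a minimal sketch of a call using the renamed weight types. The positional argument order is an assumption inferred from the parameter list above, not something taken from this diff:

```python
import torch
import intel_extension_for_transformers.qbits as qbits

# Assumed argument order: weight, transpose, blocksize, compute_type,
# weight_type, scale_type, asym (inferred from the parameter list above).
fp32_weight = torch.rand(512, 1024, dtype=torch.float)  # KxN weight
pack_weight = qbits.quantize_to_packed_weight(
    fp32_weight,
    True,    # transpose: required for KxN weight shape
    128,     # blocksize
    "fp32",  # compute_type: AVX2/AVX512F path
    "int4",  # weight_type: formerly "int4_clip"
    "fp32",  # scale_type
    False,   # asym
)
```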

@@ -37,7 +37,7 @@ pack_weight = qbits.quantize_to_packed_weight(
g_idx (torch.Tensor): shuffle index used by GPTQ, dtype must be int32.
blocksize (int): Blocksize for weight-only quantization.
compute_type (str): Computation type (fp32/bf16/int8). fp32 will leverage AVX2/AVX512F to compute, bf16 will be AMX_BF16, int8 will be VNNI/AMX_INT8.
- weight_type (str): Quantization type (int8/int4_clip/int4_fullrange/nf4/fp4_e2m1).
+ weight_type (str): Quantization type (int8/int4/int3/int2/nf4/fp4_e2m1).
scale_type (str): Scale type (fp32/bf16).
asym (bool): Whether to use asymmetric quantization.
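
A sketch of how a GPTQ-style checkpoint might be repacked under the new names. The tensor shapes and the argument order here are assumptions made for illustration; check the full signature in docs/qbits.md before relying on it:

```python
import torch
import intel_extension_for_transformers.qbits as qbits

# Illustrative shapes for a KxN layer quantized with group size 128; the
# argument order below is an assumption, not taken from this diff.
k, n, blocksize = 512, 1024, 128
qweight = torch.randint(-8, 8, (k, n), dtype=torch.int8)   # quantized weight
scales = torch.rand(k // blocksize, n, dtype=torch.float)  # per-group scales
zeros = torch.empty(0, dtype=torch.int8)                   # empty -> symmetric
g_idx = torch.arange(k, dtype=torch.int32)                 # GPTQ shuffle index, int32

pack_weight = qbits.repack_quantized_weight(
    qweight, scales, zeros, g_idx,
    "int4",    # weight_type: formerly "int4_clip"
    "fp32",    # scale_type
    "fp32",    # compute_type
    False,     # asym
    blocksize,
)
```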

@@ -57,7 +57,7 @@ pack_weight = qbits.repack_quantized_weight(
bias (torch.Tensor): Bias tensor, must be fp32, if bias is empty woq_linear will not add bias.
output (torch.Tensor): Output tensor, support fp32/bf16, shape must be MxN.
compute_type (str): Computation type (fp32/bf16/int8). fp32 will leverage AVX2/AVX512F to compute, bf16 will leverage AMX_BF16 to compute, int8 will leverage VNNI/AMX_INT8 to compute.
- weight_type (str): Quantization type (int8/int4_clip/int4_fullrange/nf4/fp4_e2m1).
+ weight_type (str): Quantization type (int8/int4/int3/int2/nf4/fp4_e2m1).
scale_type (str): Scale type (fp32/bf16).
asym (bool): Whether to use asymmetric quantization.
"""
2 changes: 1 addition & 1 deletion examples/vllm/vllm_acceleration_example.py
@@ -41,7 +41,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
config = RtnConfig(compute_dtype="int8",
group_size=128,
scale_dtype="bf16",
weight_dtype="int4_clip",
weight_dtype="int4",
bits=4)
print(config)
prompts = [args.prompt]
@@ -59,15 +59,10 @@ struct woq_runtime_ctx {
bestla::storage::gemm::IWeightBase* deseries_wei;
};

- static std::map<std::string, BTLA_DTYPE> wei2bestladt_map{{"int8", BTLA_DTYPE::S8},
-                                                           {"int4_clip", BTLA_DTYPE::S4_CLIP},
-                                                           {"int3_clip", BTLA_DTYPE::S3_CLIP},
-                                                           {"int2_clip", BTLA_DTYPE::S2_CLIP},
-                                                           {"nf4", BTLA_DTYPE::F4_NF4},
-                                                           {"fp4_e2m1_bnb", BTLA_DTYPE::F4_BNB},
-                                                           {"fp4_e2m1", BTLA_DTYPE::F4_E2M1},
-                                                           {"fp8_e4m3", BTLA_DTYPE::F8_E4M3},
-                                                           {"fp8_e5m2", BTLA_DTYPE::F8_E5M2}};
+ static std::map<std::string, BTLA_DTYPE> wei2bestladt_map{
+     {"int8", BTLA_DTYPE::S8}, {"int4", BTLA_DTYPE::S4_CLIP}, {"int3", BTLA_DTYPE::S3_CLIP},
+     {"int2", BTLA_DTYPE::S2_CLIP}, {"nf4", BTLA_DTYPE::F4_NF4}, {"fp4_e2m1_bnb", BTLA_DTYPE::F4_BNB},
+     {"fp4_e2m1", BTLA_DTYPE::F4_E2M1}, {"fp8_e4m3", BTLA_DTYPE::F8_E4M3}, {"fp8_e5m2", BTLA_DTYPE::F8_E5M2}};
static std::map<std::string, BTLA_DTYPE> scale2bestladt_map{
{"fp32", BTLA_DTYPE::F32}, {"bf16", BTLA_DTYPE::BF16}, {"fp8_e8m0", BTLA_DTYPE::F8_E8M0}};

@@ -42,8 +42,7 @@ void execute_qpack(repack_quantized_weight_param* p, repack_quantized_weight_ctx

template <class GemmCore, BTLA_ISA ISA>
void parse_prob(repack_quantized_weight_param* p, repack_quantized_weight_ctx* ctx, WOQ_TASK task) {
if (p->weight_type == "int8" || p->weight_type == "int4_clip" || p->weight_type == "int3_clip" ||
p->weight_type == "int2_clip") {
if (p->weight_type == "int8" || p->weight_type == "int4" || p->weight_type == "int3" || p->weight_type == "int2") {
return execute_qpack<bestla::prologue_b::gemm::WeightKBlockNInteger<GemmCore, ISA>>(p, ctx, task);
}
if (p->weight_type == "nf4" || p->weight_type == "fp4_e2m1_bnb" || p->weight_type == "fp4_e2m1") {
@@ -61,11 +60,11 @@ std::string get_dtype_str(BTLA_DTYPE dtype) {
case BTLA_DTYPE::BF16:
return "bf16";
case BTLA_DTYPE::S4_CLIP:
return "int4_clip";
return "int4";
case BTLA_DTYPE::S3_CLIP:
return "int3_clip";
return "int3";
case BTLA_DTYPE::S2_CLIP:
return "int2_clip";
return "int2";
case BTLA_DTYPE::F4_NF4:
return "nf4";
case BTLA_DTYPE::F4_E2M1:
@@ -205,9 +204,9 @@ torch::Tensor get_packw_info(torch::Tensor& packw, PACKW_ACQUIRE_TYPE ACQ_T) {

void bestla_packq(repack_quantized_weight_param* p, repack_quantized_weight_ctx* ctx, WOQ_TASK task) {
if (p->compute_type == "int8") {
TORCH_CHECK(p->weight_type == "int8" || p->weight_type == "int4_clip" || p->weight_type == "int3_clip" ||
p->weight_type == "int2_clip",
"Qbits: only support Integer weight-type with int8 compute-type");
TORCH_CHECK(
p->weight_type == "int8" || p->weight_type == "int4" || p->weight_type == "int3" || p->weight_type == "int2",
"Qbits: only support Integer weight-type with int8 compute-type");
if (dispatcher_utils::check_amx() && p->blocksize % bestla::gemm::ICoreRowNAmxint8KBlock<64, 16>::KTILE == 0) {
Contributor commented on this line:
Please be compatible with int4_clip and int4_fullrange

return parse_prob<bestla::gemm::ICoreRowNAmxint8KBlock<64, 16>, BTLA_ISA::AMX_INT8>(p, ctx, task);
}
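
The review comment above asks for compatibility with the old spellings. One way to keep existing configs working would be a small caller-side alias table that normalizes the deprecated strings before they reach qbits. This is only an illustration of the idea, not code from this PR; mapping int4_fullrange to plain int4 is an assumption, since the new list has no fullrange variant:

```python
# Illustration only: normalize deprecated weight_type strings to the names
# introduced by this PR before passing them to qbits.
_LEGACY_WEIGHT_TYPES = {
    "int4_clip": "int4",
    "int3_clip": "int3",
    "int2_clip": "int2",
    "int4_fullrange": "int4",  # assumption: no fullrange variant remains
}

def normalize_weight_type(weight_type: str) -> str:
    """Return the post-rename weight_type, accepting deprecated aliases."""
    return _LEGACY_WEIGHT_TYPES.get(weight_type, weight_type)

assert normalize_weight_type("int4_clip") == "int4"
assert normalize_weight_type("nf4") == "nf4"
```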
@@ -276,8 +276,7 @@ void parse_activation(woq_config_param* p, woq_runtime_ctx* ctx) {
template <WOQ_TASK TASK, class GemmCore>
void parse_weight(woq_config_param* p, woq_runtime_ctx* ctx) {
using namespace bestla::prologue_b::gemm;
if (p->weight_type == "int8" || p->weight_type == "int4_clip" || p->weight_type == "int3_clip" ||
p->weight_type == "int2_clip") {
if (p->weight_type == "int8" || p->weight_type == "int4" || p->weight_type == "int3" || p->weight_type == "int2") {
return parse_activation<TASK, GemmCore, WeightKBlockNInteger>(p, ctx);
}
if (p->weight_type == "nf4" || p->weight_type == "fp4_e2m1_bnb" || p->weight_type == "fp4_e2m1" ||
@@ -48,7 +48,7 @@ class acquire_type(Enum):
@pytest.mark.parametrize("k", [512])
@pytest.mark.parametrize("blocksize", [128])
@pytest.mark.parametrize("compute_type", ["fp32", "bf16", "int8"])
@pytest.mark.parametrize("weight_type", ["int8", "int4_clip"])
@pytest.mark.parametrize("weight_type", ["int8", "int4"])
@pytest.mark.parametrize("scale_type", ["fp32"])
@pytest.mark.parametrize("asym", [True, False])
def test(m, k, n, weight_type, scale_type, compute_type, asym, blocksize, dump_tensor=False):
@@ -17,14 +17,14 @@

from ut_utils import *

- cmpt_configs = {"int8": {"int8", "bf16", "fp32"}, "int4_clip": {"int8", "fp32", "bf16"}, "int3_clip": {"int8", "fp32", "bf16"}, "int2_clip": {"int8", "fp32", "bf16"}, "fp4_e2m1_bnb": {"fp32", "bf16"}, "fp4_e2m1": {"fp32", "bf16"}, "nf4": {"fp32", "bf16"},
+ cmpt_configs = {"int8": {"int8", "bf16", "fp32"}, "int4": {"int8", "fp32", "bf16"}, "int3": {"int8", "fp32", "bf16"}, "int2": {"int8", "fp32", "bf16"}, "fp4_e2m1_bnb": {"fp32", "bf16"}, "fp4_e2m1": {"fp32", "bf16"}, "nf4": {"fp32", "bf16"},
"fp8_e5m2": {"fp32", "bf16"}, "fp8_e4m3": {"fp32", "bf16"}
}

- scale_configs = {"int8": {"fp32", "bf16"}, "int4_clip": {"fp32", "bf16"}, "int3_clip": {"fp32", "bf16"}, "int2_clip": {"fp32", "bf16"}, "fp4_e2m1_bnb": {"fp32", "bf16"}, "fp4_e2m1": {"fp32", "bf16"}, "nf4": {"fp32", "bf16"},
+ scale_configs = {"int8": {"fp32", "bf16"}, "int4": {"fp32", "bf16"}, "int3": {"fp32", "bf16"}, "int2": {"fp32", "bf16"}, "fp4_e2m1_bnb": {"fp32", "bf16"}, "fp4_e2m1": {"fp32", "bf16"}, "nf4": {"fp32", "bf16"},
"fp8_e5m2": {"fp32", "fp8_e8m0"}, "fp8_e4m3": {"fp32", "fp8_e8m0"}}

- asym_configs = {"int8", "int4_clip", "int3_clip", "int2_clip"}
+ asym_configs = {"int8", "int4", "int3", "int2"}


@capture_args
@@ -33,7 +33,7 @@
@pytest.mark.parametrize("k", [512])
@pytest.mark.parametrize("blocksize", [128, -1])
@pytest.mark.parametrize("compute_type", ["int8", "bf16", "fp32"])
@pytest.mark.parametrize("weight_type", ["int8", "int4_clip", "int3_clip", "int2_clip", "nf4", "fp4_e2m1_bnb", "fp4_e2m1", "fp8_e5m2", "fp8_e4m3"])
@pytest.mark.parametrize("weight_type", ["int8", "int4", "int3", "int2", "nf4", "fp4_e2m1_bnb", "fp4_e2m1", "fp8_e5m2", "fp8_e4m3"])
@pytest.mark.parametrize("scale_type", ["fp32", "bf16", "fp8_e8m0"])
@pytest.mark.parametrize("asym", [True, False])
@pytest.mark.parametrize("transpose", [True, False])