[QNN EP] Update to QNN SDK 2.24.0 (#21463)
### Description
- Update pipelines to use QNN SDK 2.24 by default.
- Update the QNN_Nuget_Windows pipeline to build the C# solution without
mobile projects (fixes build errors).
- Implement a workaround for a QNN 2.24 validation bug affecting LayerNorm
ops without an explicit bias input (see the sketch after this list).
- Enable the Relu unit test, which now passes because Relu is no longer
fused into QuantizeLinear for QNN EP.
- Fix a bug where a negative quantization axis was not properly normalized
for per-channel int4 Conv.
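
At a high level, the LayerNorm workaround synthesizes an explicit all-zero quantized (int32) bias with the same shape as the scale input, using bias scales equal to the product of the X and scale input scales, and zero-point offsets of zero. The sketch below illustrates only that scale/offset derivation, with hypothetical names (`DummyBias`, `MakeZeroBias`); the actual implementation is the `LayerNormOpBuilder::ProcessInputs` change further down.

```cpp
// Minimal sketch of the workaround idea (illustrative only, not the EP's real helpers).
#include <cstdint>
#include <vector>

struct DummyBias {
  std::vector<int32_t> data;     // quantized bias values: all zeros
  std::vector<float> scales;     // scale[i] = x_scale * gamma_scale[i]
  std::vector<int32_t> offsets;  // zero-points: all zeros
};

// x_scale: per-tensor scale of input X.
// gamma_scales: per-tensor or per-channel scales of the LayerNorm scale (gamma) input.
DummyBias MakeZeroBias(float x_scale, const std::vector<float>& gamma_scales) {
  DummyBias bias;
  bias.data.assign(gamma_scales.size(), 0);
  bias.offsets.assign(gamma_scales.size(), 0);
  bias.scales.reserve(gamma_scales.size());
  for (float gs : gamma_scales) {
    bias.scales.push_back(x_scale * gs);  // bias scale is the product of the input scales
  }
  return bias;
}
```

Because both the quantized data and the offsets are zero, the synthesized bias is numerically a no-op; it exists only to satisfy the QNN 2.24 (QNN API 2.17) validator.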



### Motivation and Context
Update the QNN SDK.
adrianlizarraga authored Jul 24, 2024
1 parent b04adcc commit eb9b377
Showing 21 changed files with 339 additions and 32 deletions.
@@ -1,9 +1,11 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include <cassert>
#include "core/providers/common.h"
#include "core/providers/shared/utils/utils.h"
#include "core/framework/tensorprotoutils.h"
#include "core/providers/qnn/builder/qnn_utils.h"
#include "core/providers/qnn/builder/qnn_model_wrapper.h"
#include "core/providers/qnn/builder/op_builder_factory.h"
#include "core/common/safeint.h"
@@ -24,6 +26,11 @@ class LayerNormOpBuilder : public BaseOpBuilder {
const logging::Logger& logger) const override final ORT_MUST_USE_RESULT;

protected:
Status ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& node_unit,
const logging::Logger& logger,
std::vector<std::string>& input_names,
bool do_op_validation) const override ORT_MUST_USE_RESULT;
Status ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& node_unit,
std::vector<std::string>&& input_names,
@@ -55,6 +62,91 @@ Status LayerNormOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper,
return AddToModelBuilder(qnn_model_wrapper, node_unit, logger, true);
}

Status LayerNormOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& node_unit,
const logging::Logger& logger,
std::vector<std::string>& input_names,
bool do_op_validation) const {
ORT_UNUSED_PARAMETER(do_op_validation);

const auto& inputs = node_unit.Inputs();
const auto input_count = inputs.size();
constexpr size_t X_IDX = 0;
constexpr size_t SCALE_IDX = 1;
constexpr size_t BIAS_IDX = 2;

// Input[0] (X, required)
ORT_RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, inputs[X_IDX], logger, input_names));

// Input[1] (scale, required)
ORT_RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, inputs[SCALE_IDX], logger, input_names));

// Input[2] (bias, optional)
const bool has_bias_input = input_count > BIAS_IDX && inputs[BIAS_IDX].node_arg.Exists();
if (has_bias_input) {
ORT_RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, inputs[BIAS_IDX], logger, input_names));
}

#if QNN_API_VERSION_MAJOR == 2 && QNN_API_VERSION_MINOR == 17
if (!has_bias_input && IsNpuBackend(qnn_model_wrapper.GetQnnBackendType())) {
// Bias is implicit. QNN SDK 2.24 (QNN API version 2.17) has a validation bug for implicit bias inputs, so provide
// an explicit bias of all 0 (quantized int32).
TensorInfo x_input_info = {};
ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[X_IDX], x_input_info));

TensorInfo scale_input_info = {};
ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[SCALE_IDX], scale_input_info));

if (x_input_info.quant_param.IsPerTensor(/*include_bw*/ true) && scale_input_info.quant_param.IsQuantized()) {
const std::string bias_name = qnn::utils::GetNodeName(node_unit) + "_implicit_bias_ort_qnn_ep";

// Make dummy bias input have the same shape as the scale input.
std::vector<uint32_t> bias_shape = scale_input_info.shape;
size_t num_bias_elems = 1;
for (size_t i = 0; i < bias_shape.size(); i++) {
num_bias_elems *= static_cast<size_t>(bias_shape[i]);
}

// Bias static input should be all zeros.
std::vector<uint8_t> bias_bytes(num_bias_elems * sizeof(int32_t), 0);

// Bias's quantization scale should be the product of the other inputs' quantization scales.
std::vector<float> input0_quant_scales;
std::vector<float> input1_quant_scales;
ORT_RETURN_IF_ERROR(x_input_info.quant_param.GetScales(input0_quant_scales));
ORT_RETURN_IF_ERROR(scale_input_info.quant_param.GetScales(input1_quant_scales));

const size_t num_bias_scales_offsets = input1_quant_scales.size();
assert(input0_quant_scales.size() == 1); // Expected for per-tensor.
ORT_RETURN_IF_NOT(num_bias_scales_offsets >= input0_quant_scales.size(),
"Input[1] should have >= 1 quantization scale values");

std::vector<float> bias_scales(num_bias_scales_offsets);
for (size_t i = 0; i < num_bias_scales_offsets; i++) {
bias_scales[i] = input0_quant_scales[0] * input1_quant_scales[i];
}

std::vector<int32_t> bias_offsets(num_bias_scales_offsets, 0); // Bias's zero-points should be all zeros.
QnnQuantParamsWrapper bias_qparams;

if (scale_input_info.quant_param.IsPerChannel()) {
bias_qparams = QnnQuantParamsWrapper(bias_scales, bias_offsets, /*axis*/ 0, /*is_int4*/ false);
} else {
bias_qparams = QnnQuantParamsWrapper(bias_scales[0], bias_offsets[0]);
}

auto tensor_wrapper = QnnTensorWrapper(bias_name, QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_SFIXED_POINT_32,
std::move(bias_qparams), std::move(bias_shape), std::move(bias_bytes));

qnn_model_wrapper.AddTensorWrapper(std::move(tensor_wrapper));
input_names.push_back(bias_name);
}
}
#endif

return Status::OK();
}

Status LayerNormOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& node_unit,
std::vector<std::string>&& input_names,
10 changes: 10 additions & 0 deletions onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc
@@ -442,6 +442,16 @@ Status QnnModelWrapper::IsPerChannelQuantized(const onnxruntime::NodeUnitIODef&

if (is_per_channel) {
axis = io_def.quant_param->axis.value_or(1); // 1 is default axis for Q/DQ ops.
if (axis < 0) {
// Normalize negative axis by adding rank.
const auto* tensor_shape_proto = io_def.node_arg.Shape();
ORT_RETURN_IF_NOT(tensor_shape_proto != nullptr, "NULL tensor shape proto");

const int rank = tensor_shape_proto->dim_size();
ORT_RETURN_IF_NOT(rank > 0, "Per-channel quantized tensor should be of rank > 0");

axis += rank;
}
}

return Status::OK();
107 changes: 106 additions & 1 deletion onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc
@@ -30,13 +30,118 @@ QnnQuantParamsWrapper& QnnQuantParamsWrapper::operator=(const QnnQuantParamsWrap
return *this;
}

// Construct per-tensor quantization params.
QnnQuantParamsWrapper::QnnQuantParamsWrapper(float scale, int32_t offset) {
params_.encodingDefinition = QNN_DEFINITION_DEFINED;
params_.quantizationEncoding = QNN_QUANTIZATION_ENCODING_SCALE_OFFSET;
params_.scaleOffsetEncoding.scale = scale;
params_.scaleOffsetEncoding.offset = offset;
}

// Construct a per-channel quantization param.
QnnQuantParamsWrapper::QnnQuantParamsWrapper(gsl::span<const float> scales, gsl::span<const int32_t> offsets,
int32_t axis, bool is_int4) {
assert(scales.size() == offsets.size()); // Logic error if sizes don't match.
const uint32_t num_elems = static_cast<uint32_t>(scales.size());
params_.encodingDefinition = QNN_DEFINITION_DEFINED;

if (is_int4) {
params_.quantizationEncoding = QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET;
params_.bwAxisScaleOffsetEncoding.numElements = num_elems;
params_.bwAxisScaleOffsetEncoding.axis = axis;
params_.bwAxisScaleOffsetEncoding.bitwidth = 4;

// Deep copy to the scales[] and offsets[] arrays
if (num_elems > 0) {
const size_t num_scale_bytes = num_elems * sizeof(float);
const size_t num_zp_bytes = num_elems * sizeof(int32_t);
const size_t num_bytes = num_scale_bytes + num_zp_bytes;
constexpr std::uintptr_t align = alignof(float);
static_assert(alignof(float) == alignof(int32_t));

per_channel_data_ = std::make_unique<char[]>(num_bytes + align);
char* scales_begin = ALIGN_PTR_UP(per_channel_data_.get(), align, char*);
char* zps_begin = scales_begin + num_scale_bytes;

std::memcpy(scales_begin, scales.data(), num_scale_bytes);
std::memcpy(zps_begin, offsets.data(), num_zp_bytes);
params_.bwAxisScaleOffsetEncoding.scales = reinterpret_cast<float*>(scales_begin);
params_.bwAxisScaleOffsetEncoding.offsets = reinterpret_cast<int32_t*>(zps_begin);
} else {
params_.bwAxisScaleOffsetEncoding.scales = nullptr;
params_.bwAxisScaleOffsetEncoding.offsets = nullptr;
}
} else {
params_.quantizationEncoding = QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET;
params_.axisScaleOffsetEncoding.numScaleOffsets = num_elems;
params_.axisScaleOffsetEncoding.axis = axis;

// Deep copy to the scaleOffset data.
if (num_elems > 0) {
const size_t num_bytes = num_elems * sizeof(Qnn_ScaleOffset_t);
constexpr std::uintptr_t align = alignof(Qnn_ScaleOffset_t);
per_channel_data_ = std::make_unique<char[]>(num_bytes + align);
Qnn_ScaleOffset_t* aligned_dst = ALIGN_PTR_UP(per_channel_data_.get(), align, Qnn_ScaleOffset_t*);

for (size_t i = 0; i < static_cast<uint32_t>(num_elems); i++) {
aligned_dst[i].offset = offsets[i];
aligned_dst[i].scale = scales[i];
}

params_.axisScaleOffsetEncoding.scaleOffset = aligned_dst;
} else {
params_.axisScaleOffsetEncoding.scaleOffset = nullptr;
}
}
}

// Get a copy of scales. Works for both per-tensor and per-channel.
Status QnnQuantParamsWrapper::GetScales(/*out*/ std::vector<float>& scales) const {
ORT_RETURN_IF_NOT(params_.encodingDefinition == QNN_DEFINITION_DEFINED, "Unquantized qparams does not have scales");

switch (params_.quantizationEncoding) {
case QNN_QUANTIZATION_ENCODING_SCALE_OFFSET:
scales.resize(1);
scales[0] = params_.scaleOffsetEncoding.scale;
break;
case QNN_QUANTIZATION_ENCODING_BW_SCALE_OFFSET:
scales.resize(1);
scales[0] = params_.bwScaleOffsetEncoding.scale;
break;
case QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET: {
const uint32_t num_elems = params_.axisScaleOffsetEncoding.numScaleOffsets;
scales.resize(num_elems);

if (num_elems > 0) {
gsl::span<const Qnn_ScaleOffset_t> scale_offsets(params_.axisScaleOffsetEncoding.scaleOffset, num_elems);

for (size_t i = 0; i < num_elems; i++) {
scales[i] = scale_offsets[i].scale;
}
}
break;
}
case QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET: {
const uint32_t num_elems = params_.bwAxisScaleOffsetEncoding.numElements;
scales.resize(num_elems);

// Deep copy the scales[] and offsets[] arrays
if (num_elems > 0) {
gsl::span<const float> src_scales(params_.bwAxisScaleOffsetEncoding.scales, num_elems);
for (size_t i = 0; i < num_elems; i++) {
scales[i] = src_scales[i];
}
}
break;
}
default:
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unsupported QNN quantization encoding: ",
params_.quantizationEncoding);
}

return Status::OK();
}

QnnQuantParamsWrapper QnnQuantParamsWrapper::Copy() const {
return QnnQuantParamsWrapper(*this);
}
@@ -199,7 +304,7 @@ Status QnnQuantParamsWrapper::Init(const QnnModelWrapper& qnn_model_wrapper, con

params_.encodingDefinition = QNN_DEFINITION_DEFINED;
params_.quantizationEncoding = QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET;
params_.bwAxisScaleOffsetEncoding.axis = static_cast<int32_t>(*(ort_quant_params->axis));
params_.bwAxisScaleOffsetEncoding.axis = static_cast<int32_t>(axis);
params_.bwAxisScaleOffsetEncoding.bitwidth = 4;
params_.bwAxisScaleOffsetEncoding.numElements = static_cast<uint32_t>(num_elems);

@@ -3,6 +3,7 @@

#pragma once
#include <memory>
#include <vector>
#include "QnnTypes.h"
#include "core/common/common.h"
#include <gsl/gsl>
@@ -26,6 +27,9 @@ class QnnQuantParamsWrapper {
// Construct a per-tensor quantization param (SCALE_OFFSET)
QnnQuantParamsWrapper(float scale, int32_t offset);

// Construct a per-channel quantization param.
QnnQuantParamsWrapper(gsl::span<const float> scales, gsl::span<const int32_t> offsets, int32_t axis, bool is_int4);

Qnn_QuantizeParams_t& Get() { return params_; }
const Qnn_QuantizeParams_t& Get() const { return params_; }

@@ -54,6 +58,9 @@ class QnnQuantParamsWrapper {
(params_.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET));
}

// Get a copy of scales. Works for both per-tensor and per-channel.
Status GetScales(/*out*/ std::vector<float>& scales) const;

// Handle transposing of a per-channel quantized tensor. The quantization parameter's axis
// must be transposed using the inverse permutation of the Transpose.
template <typename IntType>
38 changes: 35 additions & 3 deletions onnxruntime/test/providers/qnn/conv_test.cc
@@ -178,18 +178,22 @@ static GetTestQDQModelFn<ActivationQType> BuildQDQPerChannelConvTestCase(const s
ORT_ENFORCE(weights_def.IsInitializer() && weights_def.IsRawData());
std::vector<float> weight_scales;
std::vector<WeightQType> weight_zero_points;
TensorShape weights_shape = weights_def.GetTensorShape();
int64_t pos_weight_quant_axis = weight_quant_axis;
if (pos_weight_quant_axis < 0) {
pos_weight_quant_axis += static_cast<int64_t>(weights_shape.NumDimensions());
}
GetTestInputQuantParamsPerChannel<WeightQType>(weights_def, weight_scales, weight_zero_points,
static_cast<size_t>(weight_quant_axis), true);
static_cast<size_t>(pos_weight_quant_axis), true);

TensorShape weights_shape = weights_def.GetTensorShape();
std::vector<WeightQType> quantized_weights;
size_t num_weight_storage_elems = weights_shape.Size();
if constexpr (std::is_same_v<WeightQType, Int4x2> || std::is_same_v<WeightQType, UInt4x2>) {
num_weight_storage_elems = Int4x2::CalcNumInt4Pairs(weights_shape.Size());
}
quantized_weights.resize(num_weight_storage_elems);
QuantizeValues<float, WeightQType>(weights_def.GetRawData(), quantized_weights, weights_shape,
weight_scales, weight_zero_points, weight_quant_axis);
weight_scales, weight_zero_points, pos_weight_quant_axis);

NodeArg* weights_initializer = builder.MakeInitializer<WeightQType>(weights_def.GetShape(), quantized_weights);
NodeArg* weights_dq = builder.MakeIntermediate();
@@ -760,6 +764,34 @@ TEST_F(QnnHTPBackendTests, ConvU16S4S32_PerChannel) {
21); // opset
}

// Test per-channel QDQ Conv with INT4 weights and a negative weight quantization axis that still points to dimension 0.
TEST_F(QnnHTPBackendTests, ConvU16S4S32_PerChannel_NegativeWeightQuantAxis) {
std::vector<int64_t> input_shape = {1, 2, 4, 4};
std::vector<int64_t> weight_shape = {3, 2, 2, 2};
std::vector<int64_t> bias_shape = {3};

TestInputDef<float> input_def(input_shape, false,
GetFloatDataInRange(0.0f, 1.0f, TensorShape(input_shape).Size()));
TestInputDef<float> weight_def(weight_shape, true,
GetFloatDataInRange(-1.0f, 5.0f, TensorShape(weight_shape).Size()));
TestInputDef<float> bias_def(bias_shape, true,
GetFloatDataInRange(-1.0f, 1.0f, TensorShape(bias_shape).Size()));

RunHTPConvOpPerChannelTest<uint8_t, Int4x2>("Conv",
input_def,
weight_def,
bias_def,
-4, // negative weight quant axis (same as 0)
{1, 1}, // Strides
{0, 0, 0, 0}, // Pads
{1, 1}, // Dilations
1, // default group
"NOTSET",
ExpectedEPNodeAssignment::All,
false, // use_qdq_contrib_ops
21); // opset
}

// Test per-channel QDQ Conv with INT4 weights. in0: u16, in1 (weight): s4, in2 (bias): s32, out: u8
// TODO(adrianlizarraga): Investigate inaccuracy for QNN EP.
//