embed int8 quantization and add embed test (#5667)

nihui committed Sep 2, 2024
1 parent 5e2d56d commit 5df5413

Showing 8 changed files with 261 additions and 9 deletions.
2 changes: 2 additions & 0 deletions .ci/pnnx.yml
@@ -4,12 +4,14 @@ on:
     branches: [master]
     paths:
     - '.ci/pnnx.yml'
+    - 'src/layer/*'
     - 'tools/pnnx/**'
     - '!tools/pnnx/README.md'
   mr:
     target-branches: [master]
     paths:
     - '.ci/pnnx.yml'
+    - 'src/layer/*'
     - 'tools/pnnx/**'
     - '!tools/pnnx/README.md'
 concurrency:
2 changes: 2 additions & 0 deletions docs/developer-guide/operators.md
@@ -837,11 +837,13 @@ y = embedding(x)
 | 1 | input_dim | int | 0 | |
 | 2 | bias_term | int | 0 | |
 | 3 | weight_data_size | int | 0 | |
+| 18 | int8_scale_term | int | 0 | |
 
 | weight | type | shape |
 | ------------- | ----- | --------------------- |
 | weight_data | float | [weight_data_size] |
 | bias_term | float | [num_output] |
+| weight_data_int8_scales | float | [1] |
 
 # Exp
 ```
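For readers checking models by hand, a sketch of how a quantized Embed layer could appear in a .param file. The layer name, blob names, and sizes here are made up for illustration; only the id assignments (0, 1, 2, 3, 18) come from the table above:

    Embed            embed0           1 1 input output 0=128 1=4096 2=0 3=524288 18=2

The trailing 18=2 is what ncnn2int8 writes after quantizing the table (see tools/quantize/ncnn2int8.cpp below); fp32 models simply omit the id.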
88 changes: 79 additions & 9 deletions src/layer/embed.cpp
@@ -30,6 +30,7 @@ int Embed::load_param(const ParamDict& pd)
     input_dim = pd.get(1, 0);
     bias_term = pd.get(2, 0);
     weight_data_size = pd.get(3, 0);
+    int8_scale_term = pd.get(18, 0);
 
     return 0;
 }
@@ -47,18 +48,23 @@ int Embed::load_model(const ModelBin& mb)
             return -100;
     }
 
+#if NCNN_INT8
+    if (int8_scale_term)
+    {
+        weight_data_int8_scale = mb.load(1, 1)[0];
+    }
+#endif // NCNN_INT8
+
     return 0;
 }
 
-int Embed::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+static void embed(const Mat& bottom_blob, const Mat& weight_data, const Mat& bias_data, Mat& top_blob, int input_dim, const Option& opt)
 {
-    int words = static_cast<int>(bottom_blob.total());
+    const int num_output = top_blob.w;
+    const int words = top_blob.h;
 
-    top_blob.create(num_output, words, 4u, opt.blob_allocator);
-    if (top_blob.empty())
-        return -100;
+    const float* bias_ptr = bias_data;
 
-    // num_output
     #pragma omp parallel for num_threads(opt.num_threads)
     for (int q = 0; q < words; q++)
     {
@@ -73,15 +79,79 @@ int Embed::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 
         const float* em = (const float*)weight_data + num_output * word_index;
 
-        memcpy(outptr, em, num_output * sizeof(float));
+        if (bias_ptr)
+        {
+            for (int p = 0; p < num_output; p++)
+            {
+                outptr[p] = em[p] + bias_ptr[p];
+            }
+        }
+        else
+        {
+            memcpy(outptr, em, num_output * sizeof(float));
+        }
+    }
+}
 
+#if NCNN_INT8
+static void embed_int8(const Mat& bottom_blob, const Mat& weight_data, float weight_data_int8_scale, const Mat& bias_data, Mat& top_blob, int input_dim, const Option& opt)
+{
+    const int num_output = top_blob.w;
+    const int words = top_blob.h;
+
+    const float* bias_ptr = bias_data;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int q = 0; q < words; q++)
+    {
+        float* outptr = top_blob.row(q);
+
+        int word_index = ((const int*)bottom_blob)[q];
 
-        if (bias_term)
+        if (word_index < 0)
+            word_index = 0;
+        if (word_index >= input_dim)
+            word_index = input_dim - 1;
+
+        const float descale_em = 1.f / weight_data_int8_scale;
+
+        const signed char* em = (const signed char*)weight_data + num_output * word_index;
+
+        if (bias_ptr)
         {
             for (int p = 0; p < num_output; p++)
             {
-                outptr[p] += bias_data[p];
+                outptr[p] = em[p] * descale_em + bias_ptr[p];
             }
         }
+        else
+        {
+            for (int p = 0; p < num_output; p++)
+            {
+                outptr[p] = em[p] * descale_em;
+            }
+        }
     }
+}
+#endif // NCNN_INT8
 
+int Embed::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{
+    int words = static_cast<int>(bottom_blob.total());
+
+    top_blob.create(num_output, words, 4u, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
+
+#if NCNN_INT8
+    if (int8_scale_term)
+    {
+        embed_int8(bottom_blob, weight_data, weight_data_int8_scale, bias_data, top_blob, input_dim, opt);
+    }
+    else
+#endif // NCNN_INT8
+    {
+        embed(bottom_blob, weight_data, bias_data, top_blob, input_dim, opt);
+    }
 
     return 0;
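The int8 path above keeps one scale for the whole embedding table and undoes it per lookup via descale_em = 1 / weight_data_int8_scale. A self-contained sketch of that round trip in plain C++ (made-up values, no ncnn dependency):

#include <cmath>
#include <cstdio>

int main()
{
    // Hypothetical embedding row; absmax is the largest |w| over the whole table.
    const float row[4] = {0.10f, -0.42f, 0.25f, 0.00f};
    const float absmax = 0.42f;
    const float scale = 127 / absmax; // the rule quantize_embed() uses (see ncnn2int8.cpp below)

    // Quantize: what the converter stores as signed bytes.
    signed char q[4];
    for (int i = 0; i < 4; i++)
        q[i] = (signed char)roundf(row[i] * scale);

    // Dequantize: what embed_int8() computes for each looked-up word.
    const float descale = 1.f / scale;
    for (int i = 0; i < 4; i++)
        printf("w=%+.4f  q=%4d  w_hat=%+.4f\n", row[i], q[i], q[i] * descale);

    return 0;
}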
6 changes: 6 additions & 0 deletions src/layer/embed.h
@@ -38,9 +38,15 @@ class Embed : public Layer
 
     int weight_data_size;
 
+    int int8_scale_term;
+
     // model
     Mat weight_data;
     Mat bias_data;
+
+#if NCNN_INT8
+    float weight_data_int8_scale;
+#endif
 };
 
 } // namespace ncnn
1 change: 1 addition & 0 deletions tests/CMakeLists.txt
@@ -101,6 +101,7 @@ ncnn_add_layer_test(Dropout)
 ncnn_add_layer_test(Einsum)
 ncnn_add_layer_test(Eltwise)
 ncnn_add_layer_test(ELU)
+ncnn_add_layer_test(Embed)
 ncnn_add_layer_test(Erf)
 ncnn_add_layer_test(ExpandDims)
 ncnn_add_layer_test(Flatten)
108 changes: 108 additions & 0 deletions tests/test_embed.cpp
@@ -0,0 +1,108 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "testutil.h"
+
+static int test_embed(int words, int num_output, int input_dim, int bias)
+{
+    ncnn::ParamDict pd;
+    pd.set(0, num_output);
+    pd.set(1, input_dim);
+    pd.set(2, bias);
+    pd.set(3, num_output * input_dim);
+
+    std::vector<ncnn::Mat> weights(bias ? 2 : 1);
+    weights[0] = RandomMat(num_output * input_dim);
+    if (bias)
+        weights[1] = RandomMat(num_output);
+
+    ncnn::Mat a(words);
+    RandomizeInt(a, 0, input_dim);
+
+    int ret = test_layer("Embed", pd, weights, a);
+    if (ret != 0)
+    {
+        fprintf(stderr, "test_embed failed words=%d num_output=%d input_dim=%d bias=%d\n", words, num_output, input_dim, bias);
+    }
+
+    return ret;
+}
+
+static int test_embed_0()
+{
+    return 0
+           || test_embed(128, 128, 128, 0)
+           || test_embed(128, 128, 128, 1)
+           || test_embed(127, 127, 127, 0)
+           || test_embed(127, 127, 127, 1)
+           || test_embed(124, 124, 124, 0)
+           || test_embed(124, 124, 124, 1);
+}
+
+#if NCNN_INT8
+static int test_embed_int8(int words, int num_output, int input_dim, int bias)
+{
+    ncnn::ParamDict pd;
+    pd.set(0, num_output);
+    pd.set(1, input_dim);
+    pd.set(2, bias);
+    pd.set(3, num_output * input_dim);
+    pd.set(18, 2);
+
+    std::vector<ncnn::Mat> weights(bias ? 3 : 2);
+    weights[0] = RandomS8Mat(num_output * input_dim);
+    if (bias)
+    {
+        weights[1] = RandomMat(num_output);
+        weights[2] = RandomMat(1, 100.f, 200.f);
+    }
+    else
+    {
+        weights[1] = RandomMat(1, 100.f, 200.f);
+    }
+
+    ncnn::Mat a(words);
+    RandomizeInt(a, 0, input_dim);
+
+    int ret = test_layer("Embed", pd, weights, a);
+    if (ret != 0)
+    {
+        fprintf(stderr, "test_embed_int8 failed words=%d num_output=%d input_dim=%d bias=%d\n", words, num_output, input_dim, bias);
+    }
+
+    return ret;
+}
+
+static int test_embed_1()
+{
+    return 0
+           || test_embed_int8(128, 128, 128, 0)
+           || test_embed_int8(128, 128, 128, 1)
+           || test_embed_int8(127, 127, 127, 0)
+           || test_embed_int8(127, 127, 127, 1)
+           || test_embed_int8(124, 124, 124, 0)
+           || test_embed_int8(124, 124, 124, 1);
+}
+#endif // NCNN_INT8
+
+int main()
+{
+    SRAND(7767517);
+
+#if NCNN_INT8
+    return test_embed_0() || test_embed_1();
+#else
+    return test_embed_0();
+#endif
+}
11 changes: 11 additions & 0 deletions tools/modelwriter.h
@@ -1676,9 +1676,20 @@ int ModelWriter::save(const char* parampath, const char* binpath)
             fprintf_param_value(" 1=%d", input_dim)
             fprintf_param_value(" 2=%d", bias_term)
             fprintf_param_value(" 3=%d", weight_data_size)
+            fprintf_param_value(" 18=%d", int8_scale_term)
 
             fwrite_weight_tag_data(op->weight_data, bp);
             fwrite_weight_data(op->bias_data, bp);
+
+#if NCNN_INT8
+            // write int8_scale data
+            if (op->int8_scale_term)
+            {
+                ncnn::Mat weight_data_int8_scales(1);
+                weight_data_int8_scales[0] = op->weight_data_int8_scale;
+                fwrite_weight_data(weight_data_int8_scales, bp, 90, 100);
+            }
+#endif // NCNN_INT8
         }
         else if (layer->type == "Exp")
         {
52 changes: 52 additions & 0 deletions tools/quantize/ncnn2int8.cpp
@@ -133,6 +133,8 @@ class NetQuantize : public ModelWriter
     int quantize_lstm();
     int quantize_gru();
 
+    int quantize_embed();
+
     int fuse_requantize();
 };
 
@@ -562,6 +564,55 @@ int NetQuantize::quantize_gru()
     return 0;
 }
 
+int NetQuantize::quantize_embed()
+{
+    for (size_t i = 0; i < layers.size(); i++)
+    {
+        if (layers[i]->type != "Embed")
+            continue;
+
+        // Embed - quantize weight from fp32 to int8
+        ncnn::Embed* embed = (ncnn::Embed*)layers[i];
+
+        fprintf(stderr, "quantize_embed %s\n", embed->name.c_str());
+
+        // TODO move to ncnn2table
+
+        const int num_output = embed->num_output;
+        const int input_dim = embed->input_dim;
+
+        ncnn::Mat weight_data_int8_scales(1);
+        {
+            const float* ptr = embed->weight_data;
+            float absmax = 0.f;
+            for (int i = 0; i < embed->weight_data.w; i++)
+            {
+                absmax = std::max(absmax, (float)fabs(ptr[i]));
+            }
+
+            weight_data_int8_scales[0] = absmax == 0.f ? 1.f : 127 / absmax;
+        }
+
+        {
+            ncnn::Mat weight_data_int8;
+
+            ncnn::Option opt_q = opt;
+            opt_q.blob_allocator = embed->weight_data.allocator;
+            opt_q.use_packing_layout = false;
+            ncnn::quantize_to_int8(embed->weight_data, weight_data_int8, weight_data_int8_scales, opt_q);
+            if (weight_data_int8.empty())
+                return -100;
+
+            embed->weight_data = weight_data_int8;
+        }
+
+        embed->int8_scale_term = 2;
+        embed->weight_data_int8_scale = weight_data_int8_scales[0];
+    }
+
+    return 0;
+}
+
 int NetQuantize::fuse_requantize()
 {
     const size_t layer_count = layers.size();
@@ -809,6 +860,7 @@ int main(int argc, char** argv)
     quantizer.quantize_rnn();
     quantizer.quantize_lstm();
    quantizer.quantize_gru();
+    quantizer.quantize_embed();
 
     quantizer.fuse_requantize();
 
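A worked instance of the scale rule in quantize_embed(), with made-up numbers: absmax = 0.5 gives scale = 127 / 0.5 = 254, so a weight w = 0.123 is stored as round(0.123 × 254) = 31 and reconstructed at lookup as 31 / 254 ≈ 0.1220, an error of roughly 0.001. Round-to-nearest keeps the error within half a quantization step, absmax / 254 ≈ 0.002, for any in-range weight. The absmax == 0.f guard keeps an all-zero table from dividing by zero by falling back to a scale of 1.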
