embed int8 quantization and add embed test (#5667)

nihui committed Sep 2, 2024
1 parent 5e2d56d commit 5df5413

Showing 8 changed files with 261 additions and 9 deletions.
2 changes: 2 additions & 0 deletions .ci/pnnx.yml
@@ -4,12 +4,14 @@ on:
     branches: [master]
     paths:
     - '.ci/pnnx.yml'
+    - 'src/layer/*'
     - 'tools/pnnx/**'
     - '!tools/pnnx/README.md'
   mr:
     target-branches: [master]
     paths:
     - '.ci/pnnx.yml'
+    - 'src/layer/*'
     - 'tools/pnnx/**'
     - '!tools/pnnx/README.md'
 concurrency:
2 changes: 2 additions & 0 deletions docs/developer-guide/operators.md
@@ -837,11 +837,13 @@ y = embedding(x)
 | 1 | input_dim | int | 0 | |
 | 2 | bias_term | int | 0 | |
 | 3 | weight_data_size | int | 0 | |
+| 18 | int8_scale_term | int | 0 | |
 
 | weight | type | shape |
 | ------------- | ----- | --------------------- |
 | weight_data | float | [weight_data_size] |
 | bias_term | float | [num_output] |
+| weight_data_int8_scales | float | [1] |
 
 # Exp
 ```
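For readers checking models by hand, a sketch of how a quantized Embed layer could appear in a .param file. The layer name, blob names, and sizes here are made up for illustration; only the id assignments (0, 1, 2, 3, 18) come from the table above:

    Embed            embed0           1 1 input output 0=128 1=4096 2=0 3=524288 18=2

The trailing 18=2 is what ncnn2int8 writes after quantizing the table (see tools/quantize/ncnn2int8.cpp below); fp32 models simply omit the id.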
88 changes: 79 additions & 9 deletions src/layer/embed.cpp
@@ -30,6 +30,7 @@ int Embed::load_param(const ParamDict& pd)
     input_dim = pd.get(1, 0);
     bias_term = pd.get(2, 0);
     weight_data_size = pd.get(3, 0);
+    int8_scale_term = pd.get(18, 0);
 
     return 0;
 }
@@ -47,18 +48,23 @@ int Embed::load_model(const ModelBin& mb)
             return -100;
     }
 
+#if NCNN_INT8
+    if (int8_scale_term)
+    {
+        weight_data_int8_scale = mb.load(1, 1)[0];
+    }
+#endif // NCNN_INT8
+
     return 0;
 }
 
-int Embed::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+static void embed(const Mat& bottom_blob, const Mat& weight_data, const Mat& bias_data, Mat& top_blob, int input_dim, const Option& opt)
 {
-    int words = static_cast<int>(bottom_blob.total());
+    const int num_output = top_blob.w;
+    const int words = top_blob.h;
 
-    top_blob.create(num_output, words, 4u, opt.blob_allocator);
-    if (top_blob.empty())
-        return -100;
+    const float* bias_ptr = bias_data;
 
-    // num_output
     #pragma omp parallel for num_threads(opt.num_threads)
     for (int q = 0; q < words; q++)
     {
@@ -73,15 +79,79 @@ int Embed::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 
         const float* em = (const float*)weight_data + num_output * word_index;
 
-        memcpy(outptr, em, num_output * sizeof(float));
+        if (bias_ptr)
+        {
+            for (int p = 0; p < num_output; p++)
+            {
+                outptr[p] = em[p] + bias_ptr[p];
+            }
+        }
+        else
+        {
+            memcpy(outptr, em, num_output * sizeof(float));
+        }
+    }
+}
 
+#if NCNN_INT8
+static void embed_int8(const Mat& bottom_blob, const Mat& weight_data, float weight_data_int8_scale, const Mat& bias_data, Mat& top_blob, int input_dim, const Option& opt)
+{
+    const int num_output = top_blob.w;
+    const int words = top_blob.h;
+
+    const float* bias_ptr = bias_data;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int q = 0; q < words; q++)
+    {
+        float* outptr = top_blob.row(q);
+
+        int word_index = ((const int*)bottom_blob)[q];
 
-        if (bias_term)
+        if (word_index < 0)
+            word_index = 0;
+        if (word_index >= input_dim)
+            word_index = input_dim - 1;
+
+        const float descale_em = 1.f / weight_data_int8_scale;
+
+        const signed char* em = (const signed char*)weight_data + num_output * word_index;
+
+        if (bias_ptr)
         {
             for (int p = 0; p < num_output; p++)
             {
-                outptr[p] += bias_data[p];
+                outptr[p] = em[p] * descale_em + bias_ptr[p];
             }
         }
+        else
+        {
+            for (int p = 0; p < num_output; p++)
+            {
+                outptr[p] = em[p] * descale_em;
+            }
+        }
     }
+}
+#endif // NCNN_INT8
 
+int Embed::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{
+    int words = static_cast<int>(bottom_blob.total());
+
+    top_blob.create(num_output, words, 4u, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
+
+#if NCNN_INT8
+    if (int8_scale_term)
+    {
+        embed_int8(bottom_blob, weight_data, weight_data_int8_scale, bias_data, top_blob, input_dim, opt);
+    }
+    else
+#endif // NCNN_INT8
+    {
+        embed(bottom_blob, weight_data, bias_data, top_blob, input_dim, opt);
+    }
 
     return 0;
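The int8 path above keeps one scale for the whole embedding table and undoes it per lookup via descale_em = 1 / weight_data_int8_scale. A self-contained sketch of that round trip in plain C++ (made-up values, no ncnn dependency):

#include <cmath>
#include <cstdio>

int main()
{
    // Hypothetical embedding row; absmax is the largest |w| over the whole table.
    const float row[4] = {0.10f, -0.42f, 0.25f, 0.00f};
    const float absmax = 0.42f;
    const float scale = 127 / absmax; // the rule quantize_embed() uses (see ncnn2int8.cpp below)

    // Quantize: what the converter stores as signed bytes.
    signed char q[4];
    for (int i = 0; i < 4; i++)
        q[i] = (signed char)roundf(row[i] * scale);

    // Dequantize: what embed_int8() computes for each looked-up word.
    const float descale = 1.f / scale;
    for (int i = 0; i < 4; i++)
        printf("w=%+.4f  q=%4d  w_hat=%+.4f\n", row[i], q[i], q[i] * descale);

    return 0;
}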
6 changes: 6 additions & 0 deletions src/layer/embed.h
@@ -38,9 +38,15 @@ class Embed : public Layer
 
     int weight_data_size;
 
+    int int8_scale_term;
+
     // model
     Mat weight_data;
     Mat bias_data;
+
+#if NCNN_INT8
+    float weight_data_int8_scale;
+#endif
 };
 
 } // namespace ncnn
1 change: 1 addition & 0 deletions tests/CMakeLists.txt
@@ -101,6 +101,7 @@ ncnn_add_layer_test(Dropout)
 ncnn_add_layer_test(Einsum)
 ncnn_add_layer_test(Eltwise)
 ncnn_add_layer_test(ELU)
+ncnn_add_layer_test(Embed)
 ncnn_add_layer_test(Erf)
 ncnn_add_layer_test(ExpandDims)
 ncnn_add_layer_test(Flatten)
108 changes: 108 additions & 0 deletions tests/test_embed.cpp
@@ -0,0 +1,108 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "testutil.h"
+
+static int test_embed(int words, int num_output, int input_dim, int bias)
+{
+    ncnn::ParamDict pd;
+    pd.set(0, num_output);
+    pd.set(1, input_dim);
+    pd.set(2, bias);
+    pd.set(3, num_output * input_dim);
+
+    std::vector<ncnn::Mat> weights(bias ? 2 : 1);
+    weights[0] = RandomMat(num_output * input_dim);
+    if (bias)
+        weights[1] = RandomMat(num_output);
+
+    ncnn::Mat a(words);
+    RandomizeInt(a, 0, input_dim);
+
+    int ret = test_layer("Embed", pd, weights, a);
+    if (ret != 0)
+    {
+        fprintf(stderr, "test_embed failed words=%d num_output=%d input_dim=%d bias=%d\n", words, num_output, input_dim, bias);
+    }
+
+    return ret;
+}
+
+static int test_embed_0()
+{
+    return 0
+           || test_embed(128, 128, 128, 0)
+           || test_embed(128, 128, 128, 1)
+           || test_embed(127, 127, 127, 0)
+           || test_embed(127, 127, 127, 1)
+           || test_embed(124, 124, 124, 0)
+           || test_embed(124, 124, 124, 1);
+}
+
+#if NCNN_INT8
+static int test_embed_int8(int words, int num_output, int input_dim, int bias)
+{
+    ncnn::ParamDict pd;
+    pd.set(0, num_output);
+    pd.set(1, input_dim);
+    pd.set(2, bias);
+    pd.set(3, num_output * input_dim);
+    pd.set(18, 2);
+
+    std::vector<ncnn::Mat> weights(bias ? 3 : 2);
+    weights[0] = RandomS8Mat(num_output * input_dim);
+    if (bias)
+    {
+        weights[1] = RandomMat(num_output);
+        weights[2] = RandomMat(1, 100.f, 200.f);
+    }
+    else
+    {
+        weights[1] = RandomMat(1, 100.f, 200.f);
+    }
+
+    ncnn::Mat a(words);
+    RandomizeInt(a, 0, input_dim);
+
+    int ret = test_layer("Embed", pd, weights, a);
+    if (ret != 0)
+    {
+        fprintf(stderr, "test_embed_int8 failed words=%d num_output=%d input_dim=%d bias=%d\n", words, num_output, input_dim, bias);
+    }
+
+    return ret;
+}
+
+static int test_embed_1()
+{
+    return 0
+           || test_embed_int8(128, 128, 128, 0)
+           || test_embed_int8(128, 128, 128, 1)
+           || test_embed_int8(127, 127, 127, 0)
+           || test_embed_int8(127, 127, 127, 1)
+           || test_embed_int8(124, 124, 124, 0)
+           || test_embed_int8(124, 124, 124, 1);
+}
+#endif // NCNN_INT8
+
+int main()
+{
+    SRAND(7767517);
+
+#if NCNN_INT8
+    return test_embed_0() || test_embed_1();
+#else
+    return test_embed_0();
+#endif
+}
11 changes: 11 additions & 0 deletions tools/modelwriter.h
@@ -1676,9 +1676,20 @@ int ModelWriter::save(const char* parampath, const char* binpath)
             fprintf_param_value(" 1=%d", input_dim)
             fprintf_param_value(" 2=%d", bias_term)
             fprintf_param_value(" 3=%d", weight_data_size)
+            fprintf_param_value(" 18=%d", int8_scale_term)
 
             fwrite_weight_tag_data(op->weight_data, bp);
             fwrite_weight_data(op->bias_data, bp);
+
+#if NCNN_INT8
+            // write int8_scale data
+            if (op->int8_scale_term)
+            {
+                ncnn::Mat weight_data_int8_scales(1);
+                weight_data_int8_scales[0] = op->weight_data_int8_scale;
+                fwrite_weight_data(weight_data_int8_scales, bp, 90, 100);
+            }
+#endif // NCNN_INT8
         }
         else if (layer->type == "Exp")
         {
52 changes: 52 additions & 0 deletions tools/quantize/ncnn2int8.cpp
@@ -133,6 +133,8 @@ class NetQuantize : public ModelWriter
     int quantize_lstm();
     int quantize_gru();
 
+    int quantize_embed();
+
     int fuse_requantize();
 };
 
@@ -562,6 +564,55 @@ int NetQuantize::quantize_gru()
     return 0;
 }
 
+int NetQuantize::quantize_embed()
+{
+    for (size_t i = 0; i < layers.size(); i++)
+    {
+        if (layers[i]->type != "Embed")
+            continue;
+
+        // Embed - quantize weight from fp32 to int8
+        ncnn::Embed* embed = (ncnn::Embed*)layers[i];
+
+        fprintf(stderr, "quantize_embed %s\n", embed->name.c_str());
+
+        // TODO move to ncnn2table
+
+        const int num_output = embed->num_output;
+        const int input_dim = embed->input_dim;
+
+        ncnn::Mat weight_data_int8_scales(1);
+        {
+            const float* ptr = embed->weight_data;
+            float absmax = 0.f;
+            for (int i = 0; i < embed->weight_data.w; i++)
+            {
+                absmax = std::max(absmax, (float)fabs(ptr[i]));
+            }
+
+            weight_data_int8_scales[0] = absmax == 0.f ? 1.f : 127 / absmax;
+        }
+
+        {
+            ncnn::Mat weight_data_int8;
+
+            ncnn::Option opt_q = opt;
+            opt_q.blob_allocator = embed->weight_data.allocator;
+            opt_q.use_packing_layout = false;
+            ncnn::quantize_to_int8(embed->weight_data, weight_data_int8, weight_data_int8_scales, opt_q);
+            if (weight_data_int8.empty())
+                return -100;
+
+            embed->weight_data = weight_data_int8;
+        }
+
+        embed->int8_scale_term = 2;
+        embed->weight_data_int8_scale = weight_data_int8_scales[0];
+    }
+
+    return 0;
+}
+
 int NetQuantize::fuse_requantize()
 {
     const size_t layer_count = layers.size();
@@ -809,6 +860,7 @@ int main(int argc, char** argv)
     quantizer.quantize_rnn();
     quantizer.quantize_lstm();
    quantizer.quantize_gru();
+    quantizer.quantize_embed();
 
     quantizer.fuse_requantize();
 
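A worked instance of the scale rule in quantize_embed(), with made-up numbers: absmax = 0.5 gives scale = 127 / 0.5 = 254, so a weight w = 0.123 is stored as round(0.123 × 254) = 31 and reconstructed at lookup as 31 / 254 ≈ 0.1220, an error of roughly 0.001. Round-to-nearest keeps the error within half a quantization step, absmax / 254 ≈ 0.002, for any in-range weight. The absmax == 0.f guard keeps an all-zero table from dividing by zero by falling back to a scale of 1.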
