Added 16-bit versions of the ADD/SUB operators, including broadcasting support.

ROCm · Jan 17, 2020 · b94cb47 · b94cb47
1 parent a0c6417
commit b94cb47
Show file tree

Hide file tree

Showing 5 changed files with 147 additions and 44 deletions.
diff --git a/tensorflow/lite/kernels/add.cc b/tensorflow/lite/kernels/add.cc
@@ -93,12 +93,24 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     output_size = TfLiteIntArrayCopy(input1->dims);
   }
 
-  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
+  // General quantized path with general rescaling: 8bit -> 8bit,
+  // as well as 16bit -> 16bit when both inputs and the output are int16.
+  bool general_16bit = input1->type == kTfLiteInt16 &&
+                       input2->type == kTfLiteInt16 &&
+                       output->type == kTfLiteInt16;
+
+  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
+      general_16bit) {
     // 8bit -> 8bit general quantized path, with general rescalings
+    // as well as, 16bit -> 16bit with general rescalings
     data->input1_offset = -input1->params.zero_point;
     data->input2_offset = -input2->params.zero_point;
     data->output_offset = output->params.zero_point;
-    data->left_shift = 20;
+
+    // The left shift is 15 for 16-bit and 20 for 8-bit data, respectively.
+    // In the 16-bit case 65535 << 15 is still less than 1 << 31,
+    // therefore the addition will still fit in a 32-bit accumulator.
+    data->left_shift = general_16bit ? 15 : 20;
     const double twice_max_input_scale =
         2 * std::max(input1->params.scale, input2->params.scale);
     const double real_input1_multiplier =
@@ -221,7 +233,12 @@ TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
                               const TfLiteTensor* input1,
                               const TfLiteTensor* input2,
                               TfLiteTensor* output) {
-  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
+  bool general_16bit = input1->type == kTfLiteInt16 &&
+                       input2->type == kTfLiteInt16 &&
+                       output->type == kTfLiteInt16;
+
+  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
+      general_16bit) {
     tflite::ArithmeticParams op_params;
     op_params.left_shift = data->left_shift;
     op_params.input1_offset = data->input1_offset;
@@ -256,6 +273,12 @@ TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
           TF_LITE_ADD(optimized_integer_ops, Add, int8_t);
         }
       }
+    } else if (output->type == kTfLiteInt16) {
+      if (need_broadcast) {
+        TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow, int16_t);
+      } else {
+        TF_LITE_ADD(reference_ops, Add, int16_t);
+      }
     } else {
       if (kernel_type == kReference) {
         if (need_broadcast) {
@@ -286,7 +309,7 @@ TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
     // The quantized version of Add doesn't support activations, so we
     // always use BroadcastAdd.
     if (kernel_type == kReference) {
-      TF_LITE_ADD(reference_ops, Add);
+      TF_LITE_ADD(reference_ops, AddLSTM);
     } else {
       TF_LITE_ADD(optimized_ops, Add);
     }

diff --git a/tensorflow/lite/kernels/add_test.cc b/tensorflow/lite/kernels/add_test.cc
@@ -306,15 +306,18 @@ TEST(QuantizedAddOpModel, QuantizedTestsNoActivationInt16) {
   const float kMin = -1.f;
   const float kMax = 32767.f / 32768.f;
   float kQuantizedTolerance = GetToleranceInt16(kMin, kMax);
-  std::vector<std::vector<float>> inputs1 = {
-      {0.1, 0.2, 0.3, 0.4}, {-0.8, 0.2, 0.4, 0.7}, {-0.8, 0.2, 0.7, 0.3}};
-  std::vector<std::vector<float>> inputs2 = {
-      {0.6, 0.4, 0.3, 0.1}, {0.6, 0.4, 0.5, -0.8}, {0.6, 0.4, -0.8, 0.5}};
-  std::vector<std::vector<float>> results = {
-      {0.7, 0.6, 0.6, 0.5}, {-0.2, 0.6, 0.9, -0.1}, {-0.2, 0.6, -0.1, 0.8}};
+  std::vector<std::vector<float>> inputs1 = {{0.1, 0.2, 0.3, 0.4, 0.9, 0.7},
+                                             {-0.8, 0.2, 0.4, 0.7, 0.1, 0.0},
+                                             {-0.8, 0.2, 0.7, 0.3, 0.9, 0.1}};
+  std::vector<std::vector<float>> inputs2 = {{0.6, 0.4, 0.3, 0.1, -0.1, 0.3},
+                                             {0.6, 0.4, 0.5, -0.8, 0.0, -1.0},
+                                             {0.6, 0.4, -0.8, 0.5, -0.9, 0.1}};
+  std::vector<std::vector<float>> results = {{0.7, 0.6, 0.6, 0.5, 0.8, 1.0},
+                                             {-0.2, 0.6, 0.9, -0.1, 0.1, -1.0},
+                                             {-0.2, 0.6, -0.1, 0.8, 0.0, 0.2}};
   for (size_t i = 0; i < inputs1.size(); ++i) {
-    QuantizedAddOpModel m({TensorType_INT16, {1, 2, 2, 1}, kMin, kMax},
-                          {TensorType_INT16, {1, 2, 2, 1}, kMin, kMax},
+    QuantizedAddOpModel m({TensorType_INT16, {1, 2, 3, 1}, kMin, kMax},
+                          {TensorType_INT16, {1, 2, 3, 1}, kMin, kMax},
                           {TensorType_INT16, {}, kMin, kMax},
                           ActivationFunctionType_NONE);
     m.QuantizeAndPopulate<int16_t>(m.input1(), inputs1[i]);
@@ -435,6 +438,10 @@ TEST(QuantizedAddOpModel, QuantizedWithScalarBroadcastInt8) {
   QuantizedWithScalarBroadcast<TensorType_INT8, int8_t>();
 }
 
+TEST(QuantizedAddOpModel, QuantizedWithScalarBroadcastInt16) {
+  QuantizedWithScalarBroadcast<TensorType_INT16, int16_t>();
+}
+
 template <enum TensorType tensor_type, typename integer_dtype>
 void QuantizedWithMixedBroadcast() {
   float kQuantizedTolerance = GetTolerance(-3.f, 3.f);
@@ -497,6 +504,10 @@ TEST(QuantizedAddOpModel, QuantizedWithMixedBroadcastInt8) {
   QuantizedWithMixedBroadcast<TensorType_INT8, int8_t>();
 }
 
+TEST(QuantizedAddOpModel, QuantizedWithMixedBroadcastInt16) {
+  QuantizedWithMixedBroadcast<TensorType_INT16, int16_t>();
+}
+
 template <enum TensorType tensor_type, typename integer_dtype>
 void QuantizedWithGenericBroadcast() {
   float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
@@ -523,5 +534,9 @@ TEST(QuantizedAddOpModel, QuantizedWithGenericdBroadcastInt8) {
   QuantizedWithGenericBroadcast<TensorType_INT8, int8_t>();
 }
 
+TEST(QuantizedAddOpModel, QuantizedWithGenericdBroadcastInt16) {
+  QuantizedWithGenericBroadcast<TensorType_INT16, int16_t>();
+}
+
 }  // namespace
 }  // namespace tflite
diff --git a/tensorflow/lite/kernels/internal/reference/add.h b/tensorflow/lite/kernels/internal/reference/add.h
@@ -51,13 +51,18 @@ inline void Add(const ArithmeticParams& params,
 
 // Element-wise add that can often be used for inner loop of broadcast add as
 // well as the non-broadcast add.
+
+// This function is used for 8-bit as well as for 16-bit data, and the
+// accumulator is 32-bit in both cases. Overflow does not happen due to the
+// choice of the shift (20 or 15, respectively - see add.cc for more comments).
+template <typename T>
 inline void AddElementwise(int size, const ArithmeticParams& params,
-                           const uint8* input1_data, const uint8* input2_data,
-                           uint8* output_data) {
-  TFLITE_DCHECK_GT(params.input1_offset, -256);
-  TFLITE_DCHECK_GT(params.input2_offset, -256);
-  TFLITE_DCHECK_LT(params.input1_offset, 256);
-  TFLITE_DCHECK_LT(params.input2_offset, 256);
+                           const T* input1_data, const T* input2_data,
+                           T* output_data) {
+  TFLITE_DCHECK_GT(params.input1_offset, -std::numeric_limits<T>::max());
+  TFLITE_DCHECK_GT(params.input2_offset, -std::numeric_limits<T>::max());
+  TFLITE_DCHECK_LT(params.input1_offset, std::numeric_limits<T>::max());
+  TFLITE_DCHECK_LT(params.input2_offset, std::numeric_limits<T>::max());
 
   for (int i = 0; i < size; ++i) {
     const int32 input1_val = params.input1_offset + input1_data[i];
@@ -78,7 +83,7 @@ inline void AddElementwise(int size, const ArithmeticParams& params,
     const int32 clamped_output =
         std::min(params.quantized_activation_max,
                  std::max(params.quantized_activation_min, raw_output));
-    output_data[i] = static_cast<uint8>(clamped_output);
+    output_data[i] = static_cast<T>(clamped_output);
   }
 }
 
@@ -138,6 +143,24 @@ inline void Add(const ArithmeticParams& params,
                 const RuntimeShape& output_shape, int16* output_data) {
   TFLITE_DCHECK_LE(params.quantized_activation_min,
                    params.quantized_activation_max);
+  const int flat_size =
+      MatchingElementsSize(input1_shape, input2_shape, output_shape);
+
+  int max_value = std::numeric_limits<int16>::max();
+
+  TFLITE_DCHECK_GT(params.input1_offset, -max_value);
+  TFLITE_DCHECK_GT(params.input2_offset, -max_value);
+  TFLITE_DCHECK_LT(params.input1_offset, max_value);
+  TFLITE_DCHECK_LT(params.input2_offset, max_value);
+  AddElementwise(flat_size, params, input1_data, input2_data, output_data);
+}
+
+inline void AddLSTM(const ArithmeticParams& params,
+                    const RuntimeShape& input1_shape, const int16* input1_data,
+                    const RuntimeShape& input2_shape, const int16* input2_data,
+                    const RuntimeShape& output_shape, int16* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
 
   const int input1_shift = params.input1_shift;
   const int flat_size =
@@ -257,13 +280,14 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
   }
 }
 
-inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
-                               const RuntimeShape& input1_shape,
-                               const uint8* input1_data,
-                               const RuntimeShape& input2_shape,
-                               const uint8* input2_data,
-                               const RuntimeShape& output_shape,
-                               uint8* output_data) {
+// This function is used for 8-bit as well as for 16-bit data, and the
+// accumulator is 32-bit in both cases. Overflow does not happen due to the
+// choice of the shift (20 or 15, respectively - see add.cc for more comments).
+template <typename T>
+inline void BroadcastAdd4DSlow(
+    const ArithmeticParams& params, const RuntimeShape& input1_shape,
+    const T* input1_data, const RuntimeShape& input2_shape,
+    const T* input2_data, const RuntimeShape& output_shape, T* output_data) {
   NdArrayDesc<4> desc1;
   NdArrayDesc<4> desc2;
   NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
@@ -313,7 +337,7 @@ inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
               std::min(params.quantized_activation_max,
                        std::max(params.quantized_activation_min, raw_output));
           output_data[Offset(extended_output_shape, b, y, x, c)] =
-              static_cast<uint8>(clamped_output);
+              static_cast<T>(clamped_output);
         }
       }
     }

diff --git a/tensorflow/lite/kernels/sub.cc b/tensorflow/lite/kernels/sub.cc
@@ -72,13 +72,14 @@ void Free(TfLiteContext* context, void* buffer) {
   delete reinterpret_cast<OpData*>(buffer);
 }
 
-TfLiteStatus Prepare8BitSubOp(TfLiteContext* context,
-                              const TfLiteTensor* input_1,
-                              const TfLiteTensor* input_2, TfLiteTensor* output,
-                              TfLiteSubParams* params, OpData* op_params,
-                              int op_sign) {
-  TF_LITE_ENSURE(context,
-                 output->type == kTfLiteUInt8 || output->type == kTfLiteInt8);
+TfLiteStatus PrepareGeneralSubOp(TfLiteContext* context,
+                                 const TfLiteTensor* input_1,
+                                 const TfLiteTensor* input_2,
+                                 TfLiteTensor* output, TfLiteSubParams* params,
+                                 OpData* op_params, int op_sign) {
+  TF_LITE_ENSURE(context, output->type == kTfLiteUInt8 ||
+                              output->type == kTfLiteInt8 ||
+                              output->type == kTfLiteInt16);
   const auto& input1_quantization_params = input_1->params;
   const auto& input2_quantization_params = input_2->params;
   const auto& output_quantization_params = output->params;
@@ -87,6 +88,9 @@ TfLiteStatus Prepare8BitSubOp(TfLiteContext* context,
   if (output->type == kTfLiteUInt8) {
     integer_type_min = std::numeric_limits<uint8_t>::min();
     integer_type_max = std::numeric_limits<uint8_t>::max();
+  } else if (output->type == kTfLiteInt16) {
+    integer_type_min = std::numeric_limits<int16_t>::min();
+    integer_type_max = std::numeric_limits<int16_t>::max();
   } else {
     // output->type == kTfLiteInt8
     integer_type_min = std::numeric_limits<int8_t>::min();
@@ -109,7 +113,11 @@ TfLiteStatus Prepare8BitSubOp(TfLiteContext* context,
   op_params->input1_offset = -input1_quantization_params.zero_point;
   op_params->input2_offset = -input2_quantization_params.zero_point;
   op_params->output_offset = output_quantization_params.zero_point;
-  op_params->left_shift = 20;
+
+  // The left shift is 15 in the 16-bit case and 20 in the 8-bit case,
+  // respectively. In the 16-bit case 65535 << 15 is still less than
+  // 1 << 31, therefore the addition will still fit in a 32-bit accumulator.
+  op_params->left_shift = output->type == kTfLiteInt16 ? 15 : 20;
   const double twice_max_input_scale =
       2 * std::max(input1_quantization_params.scale,
                    input2_quantization_params.scale);
@@ -135,13 +143,14 @@ TfLiteStatus Prepare8BitSubOp(TfLiteContext* context,
   TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
       context, params->activation, output, &op_params->output_activation_min,
       &op_params->output_activation_max));
+
   return kTfLiteOk;
 }
 
-TfLiteStatus PrepareInt16SubOp(TfLiteContext* context,
-                               const TfLiteTensor* input1,
-                               const TfLiteTensor* input2, TfLiteTensor* output,
-                               TfLiteSubParams* params, OpData* data) {
+TfLiteStatus PrepareLSTMSubOp(TfLiteContext* context,
+                              const TfLiteTensor* input1,
+                              const TfLiteTensor* input2, TfLiteTensor* output,
+                              TfLiteSubParams* params, OpData* data) {
   // 16bit -> 16bit special quantized path, supporting only a rather
   // narrow case of quantization parameters: zero_points must all be 0
   // ("symmetric quantization") and scales must be power-of-two (which
@@ -208,12 +217,21 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     output_size = TfLiteIntArrayCopy(input1->dims);
   }
 
-  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) {
-    TF_LITE_ENSURE_OK(context, Prepare8BitSubOp(context, input1, input2, output,
-                                                params, data, -1));
+  // General quantized path with general rescaling: 8bit -> 8bit,
+  // as well as 16bit -> 16bit when both inputs and the output are int16.
+
+  bool general_16bit = output->type == kTfLiteInt16 &&
+                       input1->type == kTfLiteInt16 &&
+                       input2->type == kTfLiteInt16;
+
+  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
+      general_16bit) {
+    TF_LITE_ENSURE_OK(context, PrepareGeneralSubOp(context, input1, input2,
+                                                   output, params, data, -1));
   } else if (output->type == kTfLiteInt16) {
-    TF_LITE_ENSURE_OK(context, PrepareInt16SubOp(context, input1, input2,
-                                                 output, params, data));
+    // LSTM special case: scale parameters must be powers of two (POT)
+    TF_LITE_ENSURE_OK(context, PrepareLSTMSubOp(context, input1, input2, output,
+                                                params, data));
   }
 
   return context->ResizeTensor(context, output, output_size);
@@ -288,6 +306,11 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
   const bool need_broadcast = optimized_ops::ProcessBroadcastShapes(
       GetTensorShape(input1), GetTensorShape(input2), &op_params);
 
+  // 16bit -> 16bit with general rescaling
+  bool general_16bit = output->type == kTfLiteInt16 &&
+                       input1->type == kTfLiteInt16 &&
+                       input2->type == kTfLiteInt16;
+
 #define TF_LITE_SUB(type, opname, data_type)                             \
   type::opname(op_params, GetTensorShape(input1),                        \
                GetTensorData<data_type>(input1), GetTensorShape(input2), \
@@ -301,6 +324,12 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
     } else {
       TF_LITE_SUB(reference_integer_ops, Add, int8_t);
     }
+  } else if (general_16bit) {
+    if (need_broadcast) {
+      TF_LITE_SUB(reference_ops, BroadcastAdd4DSlow, int16_t);
+    } else {
+      TF_LITE_SUB(reference_ops, Add, int16_t);
+    }
   } else if (output->type == kTfLiteUInt8) {
     if (kernel_type == kReference) {
       if (need_broadcast) {

diff --git a/tensorflow/lite/kernels/sub_test.cc b/tensorflow/lite/kernels/sub_test.cc
@@ -226,6 +226,10 @@ TEST(QuantizedSubOpModel, QuantizedTestsNoActivationInt8) {
   QuantizedTestsNoActivation<TensorType_INT8, int8_t>();
 }
 
+TEST(QuantizedSubOpModel, QuantizedTestsNoActivationInt16Generic) {
+  QuantizedTestsNoActivation<TensorType_INT16, int16_t>();
+}
+
 template <TensorType tensor_type, typename integer_dtype>
 void QuantizedTestsActivationRELU_N1_TO_1() {
   float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
@@ -287,6 +291,10 @@ TEST(QuantizedSubOpModel, QuantizedVariousInputShapesInt8) {
   QuantizedVariousInputShapes<TensorType_INT8, int8_t>();
 }
 
+TEST(QuantizedSubOpModel, QuantizedVariousInputShapesInt16) {
+  QuantizedVariousInputShapes<TensorType_INT16, int16_t>();
+}
+
 template <TensorType tensor_type, typename integer_dtype>
 void QuantizedWithBroadcast() {
   float kQuantizedTolerance = GetTolerance(-3.0, 3.0);
@@ -315,6 +323,10 @@ TEST(QuantizedSubOpModel, QuantizedWithBroadcastInt8) {
   QuantizedWithBroadcast<TensorType_INT8, int8_t>();
 }
 
+TEST(QuantizedSubOpModel, QuantizedWithBroadcastInt16) {
+  QuantizedWithBroadcast<TensorType_INT16, int16_t>();
+}
+
 TEST(QuantizedSubOpModel, QuantizedTestsNoActivationInt16) {
   const float kMin = -1.f;
   const float kMax =