Skip to content

Commit

Permalink
fix build when the toolchain defines the `_L` and `_U` constants (rename conflicting local identifiers)
Browse files Browse the repository at this point in the history
  • Loading branch information
nihui committed Aug 18, 2023
1 parent 39721ee commit 3644f56
Show file tree
Hide file tree
Showing 41 changed files with 403 additions and 399 deletions.
160 changes: 80 additions & 80 deletions src/layer/arm/gru_arm.cpp

Large diffs are not rendered by default.

104 changes: 52 additions & 52 deletions src/layer/arm/gru_arm_asimdhp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M
const __fp16* weight_xc_RUN = weight_xc.row<const __fp16>(q / 4);
const __fp16* weight_hc_RUN = weight_hc.row<const __fp16>(q / 4);

float32x4_t _R = vcvt_f32_f16(vld1_f16(bias_c_RUBNWN));
float32x4_t _U = vcvt_f32_f16(vld1_f16(bias_c_RUBNWN + 4));
float32x4_t _gru_R = vcvt_f32_f16(vld1_f16(bias_c_RUBNWN));
float32x4_t _gru_U = vcvt_f32_f16(vld1_f16(bias_c_RUBNWN + 4));
float32x4_t _sum1 = vdupq_n_f32(0.f);
float32x4_t _sum2 = vdupq_n_f32(0.f);
float32x4_t _sum3 = vdupq_n_f32(0.f);
Expand All @@ -78,8 +78,8 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M
float32x4_t _weight_xc_U_2 = vcvt_f32_f16(vld1_f16(weight_xc_RUN + 20));
float32x4_t _weight_xc_R_3 = vcvt_f32_f16(vld1_f16(weight_xc_RUN + 24));
float32x4_t _weight_xc_U_3 = vcvt_f32_f16(vld1_f16(weight_xc_RUN + 28));
_R = vfmaq_laneq_f32(_R, _weight_xc_R, _xi, 0);
_U = vfmaq_laneq_f32(_U, _weight_xc_U, _xi, 0);
_gru_R = vfmaq_laneq_f32(_gru_R, _weight_xc_R, _xi, 0);
_gru_U = vfmaq_laneq_f32(_gru_U, _weight_xc_U, _xi, 0);
_sum1 = vfmaq_laneq_f32(_sum1, _weight_xc_R_1, _xi, 1);
_sum2 = vfmaq_laneq_f32(_sum2, _weight_xc_U_1, _xi, 1);
_sum3 = vfmaq_laneq_f32(_sum3, _weight_xc_R_2, _xi, 2);
Expand All @@ -96,8 +96,8 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M
float32x4_t _xi = vcvt_f32_f16(vdup_n_f16(xi));
float32x4_t _weight_xc_R = vcvt_f32_f16(vld1_f16(weight_xc_RUN));
float32x4_t _weight_xc_U = vcvt_f32_f16(vld1_f16(weight_xc_RUN + 4));
_R = vmlaq_f32(_R, _weight_xc_R, _xi);
_U = vmlaq_f32(_U, _weight_xc_U, _xi);
_gru_R = vmlaq_f32(_gru_R, _weight_xc_R, _xi);
_gru_U = vmlaq_f32(_gru_U, _weight_xc_U, _xi);

weight_xc_RUN += 8;
}
Expand All @@ -114,8 +114,8 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M
float32x4_t _weight_hc_U_2 = vcvt_f32_f16(vld1_f16(weight_hc_RUN + 20));
float32x4_t _weight_hc_R_3 = vcvt_f32_f16(vld1_f16(weight_hc_RUN + 24));
float32x4_t _weight_hc_U_3 = vcvt_f32_f16(vld1_f16(weight_hc_RUN + 28));
_R = vfmaq_laneq_f32(_R, _weight_hc_R, _h_cont, 0);
_U = vfmaq_laneq_f32(_U, _weight_hc_U, _h_cont, 0);
_gru_R = vfmaq_laneq_f32(_gru_R, _weight_hc_R, _h_cont, 0);
_gru_U = vfmaq_laneq_f32(_gru_U, _weight_hc_U, _h_cont, 0);
_sum1 = vfmaq_laneq_f32(_sum1, _weight_hc_R_1, _h_cont, 1);
_sum2 = vfmaq_laneq_f32(_sum2, _weight_hc_U_1, _h_cont, 1);
_sum3 = vfmaq_laneq_f32(_sum3, _weight_hc_R_2, _h_cont, 2);
Expand All @@ -132,26 +132,26 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M
float32x4_t _h_cont = vdupq_n_f32(h_cont);
float32x4_t _weight_hc_R = vcvt_f32_f16(vld1_f16(weight_hc_RUN));
float32x4_t _weight_hc_U = vcvt_f32_f16(vld1_f16(weight_hc_RUN + 4));
_R = vmlaq_f32(_R, _weight_hc_R, _h_cont);
_U = vmlaq_f32(_U, _weight_hc_U, _h_cont);
_gru_R = vmlaq_f32(_gru_R, _weight_hc_R, _h_cont);
_gru_U = vmlaq_f32(_gru_U, _weight_hc_U, _h_cont);

weight_hc_RUN += 8;
}

_R = vaddq_f32(_R, _sum1);
_U = vaddq_f32(_U, _sum2);
_gru_R = vaddq_f32(_gru_R, _sum1);
_gru_U = vaddq_f32(_gru_U, _sum2);
_sum3 = vaddq_f32(_sum3, _sum5);
_sum4 = vaddq_f32(_sum4, _sum6);
_R = vaddq_f32(_R, _sum3);
_U = vaddq_f32(_U, _sum4);
_gru_R = vaddq_f32(_gru_R, _sum3);
_gru_U = vaddq_f32(_gru_U, _sum4);

// sigmoid(R)
// sigmoid(U)
_R = sigmoid_ps(_R);
_U = sigmoid_ps(_U);
_gru_R = sigmoid_ps(_gru_R);
_gru_U = sigmoid_ps(_gru_U);

// gate new
float32x4_t _N = vcvt_f32_f16(vld1_f16(bias_c_RUBNWN + 8));
float32x4_t _gru_N = vcvt_f32_f16(vld1_f16(bias_c_RUBNWN + 8));
_sum1 = vdupq_n_f32(0.f);
_sum2 = vdupq_n_f32(0.f);
_sum3 = vdupq_n_f32(0.f);
Expand All @@ -164,7 +164,7 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M
float32x4_t _weight_hc_N_1 = vcvt_f32_f16(vld1_f16(weight_hc_RUN + 4));
float32x4_t _weight_hc_N_2 = vcvt_f32_f16(vld1_f16(weight_hc_RUN + 8));
float32x4_t _weight_hc_N_3 = vcvt_f32_f16(vld1_f16(weight_hc_RUN + 12));
_N = vfmaq_laneq_f32(_N, _weight_hc_N, _h_cont, 0);
_gru_N = vfmaq_laneq_f32(_gru_N, _weight_hc_N, _h_cont, 0);
_sum1 = vfmaq_laneq_f32(_sum1, _weight_hc_N_1, _h_cont, 1);
_sum2 = vfmaq_laneq_f32(_sum2, _weight_hc_N_2, _h_cont, 2);
_sum3 = vfmaq_laneq_f32(_sum3, _weight_hc_N_3, _h_cont, 3);
Expand All @@ -177,16 +177,16 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M

float32x4_t _h_cont = vdupq_n_f32(h_cont);
float32x4_t _weight_hc_N = vcvt_f32_f16(vld1_f16(weight_hc_RUN));
_N = vmlaq_f32(_N, _weight_hc_N, _h_cont);
_gru_N = vmlaq_f32(_gru_N, _weight_hc_N, _h_cont);

weight_hc_RUN += 4;
}

_N = vaddq_f32(_N, _sum1);
_gru_N = vaddq_f32(_gru_N, _sum1);
_sum2 = vaddq_f32(_sum2, _sum3);
_N = vaddq_f32(_N, _sum2);
_gru_N = vaddq_f32(_gru_N, _sum2);

_N = vmlaq_f32(vcvt_f32_f16(vld1_f16(bias_c_RUBNWN + 12)), _R, _N);
_gru_N = vmlaq_f32(vcvt_f32_f16(vld1_f16(bias_c_RUBNWN + 12)), _gru_R, _gru_N);
_sum1 = vdupq_n_f32(0.f);
_sum2 = vdupq_n_f32(0.f);
_sum3 = vdupq_n_f32(0.f);
Expand All @@ -199,7 +199,7 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M
float32x4_t _weight_xc_N_1 = vcvt_f32_f16(vld1_f16(weight_xc_RUN + 4));
float32x4_t _weight_xc_N_2 = vcvt_f32_f16(vld1_f16(weight_xc_RUN + 8));
float32x4_t _weight_xc_N_3 = vcvt_f32_f16(vld1_f16(weight_xc_RUN + 12));
_N = vfmaq_laneq_f32(_N, _weight_xc_N, _xi, 0);
_gru_N = vfmaq_laneq_f32(_gru_N, _weight_xc_N, _xi, 0);
_sum1 = vfmaq_laneq_f32(_sum1, _weight_xc_N_1, _xi, 1);
_sum2 = vfmaq_laneq_f32(_sum2, _weight_xc_N_2, _xi, 2);
_sum3 = vfmaq_laneq_f32(_sum3, _weight_xc_N_3, _xi, 3);
Expand All @@ -212,22 +212,22 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M

float32x4_t _xi = vcvt_f32_f16(vdup_n_f16(xi));
float32x4_t _weight_xc_N = vcvt_f32_f16(vld1_f16(weight_xc_RUN));
_N = vmlaq_f32(_N, _weight_xc_N, _xi);
_gru_N = vmlaq_f32(_gru_N, _weight_xc_N, _xi);

weight_xc_RUN += 4;
}

_N = vaddq_f32(_N, _sum1);
_gru_N = vaddq_f32(_gru_N, _sum1);
_sum2 = vaddq_f32(_sum2, _sum3);
_N = vaddq_f32(_N, _sum2);
_gru_N = vaddq_f32(_gru_N, _sum2);

// tanh(N)
_N = tanh_ps(_N);
_gru_N = tanh_ps(_gru_N);

float* gates_data = gates.row(q / 4);

vst1q_f32(gates_data, _U);
vst1q_f32(gates_data + 4, _N);
vst1q_f32(gates_data, _gru_U);
vst1q_f32(gates_data + 4, _gru_N);
}
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = remain_num_output_start; q < num_output; q++)
Expand Down Expand Up @@ -314,13 +314,13 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M

const float* gates_data = gates.row(q / 4);

float32x4_t _U = vld1q_f32(gates_data);
float32x4_t _N = vld1q_f32(gates_data + 4);
float32x4_t _gru_U = vld1q_f32(gates_data);
float32x4_t _gru_N = vld1q_f32(gates_data + 4);

float32x4_t _H = vaddq_f32(vmulq_f32(vsubq_f32(vdupq_n_f32(1.f), _U), _N), vmulq_f32(_U, vld1q_f32(hidden_ptr + q)));
float32x4_t _gru_H = vaddq_f32(vmulq_f32(vsubq_f32(vdupq_n_f32(1.f), _gru_U), _gru_N), vmulq_f32(_gru_U, vld1q_f32(hidden_ptr + q)));

vst1q_f32(hidden_ptr + q, _H);
vst1_f16(output_data + q, vcvt_f16_f32(_H));
vst1q_f32(hidden_ptr + q, _gru_H);
vst1_f16(output_data + q, vcvt_f16_f32(_gru_H));
}
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = remain_num_output_start; q < num_output; q++)
Expand Down Expand Up @@ -463,7 +463,7 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const
hidden_ptr = hidden_state;

// gate new
float16x4_t _N = vld1_f16(bias_c_RUBNWN + 8);
float16x4_t _gru_N = vld1_f16(bias_c_RUBNWN + 8);
float16x4_t _sum4 = vdup_n_f16((__fp16)0.f);
float16x4_t _sum5 = vdup_n_f16((__fp16)0.f);
float16x4_t _sum6 = vdup_n_f16((__fp16)0.f);
Expand All @@ -481,13 +481,13 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const
"fmla %5.4h, v3.4h, v4.h[3] \n"
: "=r"(hidden_ptr),
"=r"(weight_hc_RUN),
"=w"(_N),
"=w"(_gru_N),
"=w"(_sum4),
"=w"(_sum5),
"=w"(_sum6)
: "0"(hidden_ptr),
"1"(weight_hc_RUN),
"2"(_N),
"2"(_gru_N),
"3"(_sum4),
"4"(_sum5),
"5"(_sum6)
Expand All @@ -499,16 +499,16 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const

float16x4_t _h_cont = vdup_n_f16((__fp16)h_cont);
float16x4_t _weight_hc_N = vld1_f16(weight_hc_RUN);
_N = vfma_f16(_N, _weight_hc_N, _h_cont);
_gru_N = vfma_f16(_gru_N, _weight_hc_N, _h_cont);

weight_hc_RUN += 4;
}

_N = vadd_f16(_N, _sum4);
_gru_N = vadd_f16(_gru_N, _sum4);
_sum5 = vadd_f16(_sum5, _sum6);
_N = vadd_f16(_N, _sum5);
_gru_N = vadd_f16(_gru_N, _sum5);

_N = vfma_f16(vld1_f16(bias_c_RUBNWN + 12), vcvt_f16_f32(_R32), _N);
_gru_N = vfma_f16(vld1_f16(bias_c_RUBNWN + 12), vcvt_f16_f32(_R32), _gru_N);
_sum4 = vdup_n_f16((__fp16)0.f);
_sum5 = vdup_n_f16((__fp16)0.f);
_sum6 = vdup_n_f16((__fp16)0.f);
Expand All @@ -525,13 +525,13 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const
"fmla %5.4h, v3.4h, v4.h[3] \n"
: "=r"(x),
"=r"(weight_xc_RUN),
"=w"(_N),
"=w"(_gru_N),
"=w"(_sum4),
"=w"(_sum5),
"=w"(_sum6)
: "0"(x),
"1"(weight_xc_RUN),
"2"(_N),
"2"(_gru_N),
"3"(_sum4),
"4"(_sum5),
"5"(_sum6)
Expand All @@ -543,17 +543,17 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const

float16x4_t _xi = vdup_n_f16(xi);
float16x4_t _weight_xc_N = vld1_f16(weight_xc_RUN);
_N = vfma_f16(_N, _weight_xc_N, _xi);
_gru_N = vfma_f16(_gru_N, _weight_xc_N, _xi);

weight_xc_RUN += 4;
}

_N = vadd_f16(_N, _sum4);
_gru_N = vadd_f16(_gru_N, _sum4);
_sum5 = vadd_f16(_sum5, _sum6);
_N = vadd_f16(_N, _sum5);
_gru_N = vadd_f16(_gru_N, _sum5);

// tanh(N)
float32x4_t _N32 = tanh_ps(vcvt_f32_f16(_N));
float32x4_t _N32 = tanh_ps(vcvt_f32_f16(_gru_N));

float* gates_data = gates.row(q / 4);

Expand Down Expand Up @@ -645,13 +645,13 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const

const float* gates_data = gates.row(q / 4);

float32x4_t _U = vld1q_f32(gates_data);
float32x4_t _N = vld1q_f32(gates_data + 4);
float32x4_t _gru_U = vld1q_f32(gates_data);
float32x4_t _gru_N = vld1q_f32(gates_data + 4);

float32x4_t _H = vaddq_f32(vmulq_f32(vsubq_f32(vdupq_n_f32(1.f), _U), _N), vmulq_f32(_U, vld1q_f32(hidden_ptr + q)));
float32x4_t _gru_H = vaddq_f32(vmulq_f32(vsubq_f32(vdupq_n_f32(1.f), _gru_U), _gru_N), vmulq_f32(_gru_U, vld1q_f32(hidden_ptr + q)));

vst1q_f32(hidden_ptr + q, _H);
vst1_f16(output_data + q, vcvt_f16_f32(_H));
vst1q_f32(hidden_ptr + q, _gru_H);
vst1_f16(output_data + q, vcvt_f16_f32(_gru_H));
}
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = remain_num_output_start; q < num_output; q++)
Expand Down
10 changes: 5 additions & 5 deletions src/layer/arm/interp_bicubic_pack4.h
Original file line number Diff line number Diff line change
Expand Up @@ -254,11 +254,11 @@ static void resize_bicubic_image_pack4(const Mat& src, Mat& dst, float* alpha, i
float32x4_t _rows1 = vld1q_f32(rows1p);
float32x4_t _rows2 = vld1q_f32(rows2p);
float32x4_t _rows3 = vld1q_f32(rows3p);
float32x4_t _D = vmulq_lane_f32(_rows0, vget_low_f32(_b0123), 0);
_D = vmlaq_lane_f32(_D, _rows1, vget_low_f32(_b0123), 1);
_D = vmlaq_lane_f32(_D, _rows2, vget_high_f32(_b0123), 0);
_D = vmlaq_lane_f32(_D, _rows3, vget_high_f32(_b0123), 1);
vst1q_f32(Dp, _D);
float32x4_t _Dp = vmulq_lane_f32(_rows0, vget_low_f32(_b0123), 0);
_Dp = vmlaq_lane_f32(_Dp, _rows1, vget_low_f32(_b0123), 1);
_Dp = vmlaq_lane_f32(_Dp, _rows2, vget_high_f32(_b0123), 0);
_Dp = vmlaq_lane_f32(_Dp, _rows3, vget_high_f32(_b0123), 1);
vst1q_f32(Dp, _Dp);

Dp += 4;
rows0p += 4;
Expand Down
10 changes: 5 additions & 5 deletions src/layer/arm/interp_bicubic_pack4_bf16s.h
Original file line number Diff line number Diff line change
Expand Up @@ -254,11 +254,11 @@ static void resize_bicubic_image_pack4_bf16s(const Mat& src, Mat& dst, float* al
float32x4_t _rows1 = vld1q_f32(rows1p);
float32x4_t _rows2 = vld1q_f32(rows2p);
float32x4_t _rows3 = vld1q_f32(rows3p);
float32x4_t _D = vmulq_lane_f32(_rows0, vget_low_f32(_b0123), 0);
_D = vmlaq_lane_f32(_D, _rows1, vget_low_f32(_b0123), 1);
_D = vmlaq_lane_f32(_D, _rows2, vget_high_f32(_b0123), 0);
_D = vmlaq_lane_f32(_D, _rows3, vget_high_f32(_b0123), 1);
vst1_u16(Dp, float2bfloat(_D));
float32x4_t _Dp = vmulq_lane_f32(_rows0, vget_low_f32(_b0123), 0);
_Dp = vmlaq_lane_f32(_Dp, _rows1, vget_low_f32(_b0123), 1);
_Dp = vmlaq_lane_f32(_Dp, _rows2, vget_high_f32(_b0123), 0);
_Dp = vmlaq_lane_f32(_Dp, _rows3, vget_high_f32(_b0123), 1);
vst1_u16(Dp, float2bfloat(_Dp));

Dp += 4;
rows0p += 4;
Expand Down
20 changes: 10 additions & 10 deletions src/layer/arm/interp_bicubic_pack4_fp16s.h
Original file line number Diff line number Diff line change
Expand Up @@ -253,11 +253,11 @@ static void resize_bicubic_image_pack4_fp16s(const Mat& src, Mat& dst, float* al
float32x4_t _rows1 = vld1q_f32(rows1p);
float32x4_t _rows2 = vld1q_f32(rows2p);
float32x4_t _rows3 = vld1q_f32(rows3p);
float32x4_t _D = vmulq_laneq_f32(_rows0, _b0123, 0);
_D = vfmaq_laneq_f32(_D, _rows1, _b0123, 1);
_D = vfmaq_laneq_f32(_D, _rows2, _b0123, 2);
_D = vfmaq_laneq_f32(_D, _rows3, _b0123, 3);
vst1_f16(Dp, vcvt_f16_f32(_D));
float32x4_t _Dp = vmulq_laneq_f32(_rows0, _b0123, 0);
_Dp = vfmaq_laneq_f32(_Dp, _rows1, _b0123, 1);
_Dp = vfmaq_laneq_f32(_Dp, _rows2, _b0123, 2);
_Dp = vfmaq_laneq_f32(_Dp, _rows3, _b0123, 3);
vst1_f16(Dp, vcvt_f16_f32(_Dp));

Dp += 4;
rows0p += 4;
Expand Down Expand Up @@ -511,11 +511,11 @@ static void resize_bicubic_image_pack4_fp16sa(const Mat& src, Mat& dst, __fp16*
float16x4_t _rows1 = vld1_f16(rows1p);
float16x4_t _rows2 = vld1_f16(rows2p);
float16x4_t _rows3 = vld1_f16(rows3p);
float16x4_t _D = vmul_lane_f16(_rows0, _b0123, 0);
_D = vfma_lane_f16(_D, _rows1, _b0123, 1);
_D = vfma_lane_f16(_D, _rows2, _b0123, 2);
_D = vfma_lane_f16(_D, _rows3, _b0123, 3);
vst1_f16(Dp, _D);
float16x4_t _Dp = vmul_lane_f16(_rows0, _b0123, 0);
_Dp = vfma_lane_f16(_Dp, _rows1, _b0123, 1);
_Dp = vfma_lane_f16(_Dp, _rows2, _b0123, 2);
_Dp = vfma_lane_f16(_Dp, _rows3, _b0123, 3);
vst1_f16(Dp, _Dp);

Dp += 4;
rows0p += 4;
Expand Down
10 changes: 5 additions & 5 deletions src/layer/arm/interp_bicubic_pack8_fp16s.h
Original file line number Diff line number Diff line change
Expand Up @@ -253,11 +253,11 @@ static void resize_bicubic_image_pack8_fp16sa(const Mat& src, Mat& dst, __fp16*
float16x8_t _rows1 = vld1q_f16(rows1p);
float16x8_t _rows2 = vld1q_f16(rows2p);
float16x8_t _rows3 = vld1q_f16(rows3p);
float16x8_t _D = vmulq_lane_f16(_rows0, _b0123, 0);
_D = vfmaq_lane_f16(_D, _rows1, _b0123, 1);
_D = vfmaq_lane_f16(_D, _rows2, _b0123, 2);
_D = vfmaq_lane_f16(_D, _rows3, _b0123, 3);
vst1q_f16(Dp, _D);
float16x8_t _Dp = vmulq_lane_f16(_rows0, _b0123, 0);
_Dp = vfmaq_lane_f16(_Dp, _rows1, _b0123, 1);
_Dp = vfmaq_lane_f16(_Dp, _rows2, _b0123, 2);
_Dp = vfmaq_lane_f16(_Dp, _rows3, _b0123, 3);
vst1q_f16(Dp, _Dp);

Dp += 8;
rows0p += 8;
Expand Down
12 changes: 6 additions & 6 deletions src/layer/arm/interp_bilinear.h
Original file line number Diff line number Diff line change
Expand Up @@ -193,18 +193,18 @@ static void resize_bilinear_image(const Mat& src, Mat& dst, float* alpha, int* x
float32x4_t _rows0 = vld1q_f32(rows0p);
float32x4_t _rows1 = vld1q_f32(rows1p);

float32x4_t _D = vmulq_f32(_rows0, _b0);
_D = vmlaq_f32(_D, _rows1, _b1);
float32x4_t _Dp = vmulq_f32(_rows0, _b0);
_Dp = vmlaq_f32(_Dp, _rows1, _b1);

vst1q_f32(Dp, _D);
vst1q_f32(Dp, _Dp);

float32x4_t _rows0n = vld1q_f32(rows0p + 4);
float32x4_t _rows1n = vld1q_f32(rows1p + 4);

float32x4_t _Dn = vmulq_f32(_rows0n, _b0);
_Dn = vmlaq_f32(_Dn, _rows1n, _b1);
float32x4_t _Dpn = vmulq_f32(_rows0n, _b0);
_Dpn = vmlaq_f32(_Dpn, _rows1n, _b1);

vst1q_f32(Dp + 4, _Dn);
vst1q_f32(Dp + 4, _Dpn);

Dp += 8;
rows0p += 8;
Expand Down
Loading

0 comments on commit 3644f56

Please sign in to comment.