From 37d16b3807088d8b103d5faa79001fe7dc013439 Mon Sep 17 00:00:00 2001 From: dijopaul Date: Mon, 30 Jun 2025 05:20:37 -0700 Subject: [PATCH 1/4] Fix for build issue in greater_lesser_equal on custom cores --- .../nnlib/xa_nn_greater_lesser_equal_f32.c | 117 +++++++++--------- 1 file changed, 59 insertions(+), 58 deletions(-) diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_greater_lesser_equal_f32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_greater_lesser_equal_f32.c index 9e51357b1a6..35581a42471 100644 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_greater_lesser_equal_f32.c +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_greater_lesser_equal_f32.c @@ -2,6 +2,7 @@ #include "xa_nnlib_common_fpu.h" #include "xa_nn_common.h" #include "xa_nnlib_err_chk.h" +//#include "xa_nn_basic_state.h" #include "xa_nnlib_kernels_api.h" @@ -54,7 +55,7 @@ WORD32 xa_nn_elm_greater_lesser_equal_f32xf32_f32(WORD8 * __restrict__ p_out, XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x2, x1); + xtbool2 check = XT_OLE_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -79,7 +80,7 @@ WORD32 xa_nn_elm_greater_lesser_equal_f32xf32_f32(WORD8 * __restrict__ p_out, XT_LASX2IP(x2, inp2_a, inp2); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x2, x1); + xtbool2 check = XT_OLE_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -117,7 +118,7 @@ WORD32 xa_nn_elm_greater_lesser_equal_f32xf32_f32(WORD8 * __restrict__ p_out, XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x2, x1); + xtbool2 check = XT_OLT_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -142,7 +143,7 @@ WORD32 xa_nn_elm_greater_lesser_equal_f32xf32_f32(WORD8 * __restrict__ p_out, XT_LASX2IP(x2, inp2_a, inp2); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x2, x1); + xtbool2 check = XT_OLT_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -180,7 +181,7 @@ WORD32 xa_nn_elm_greater_lesser_equal_f32xf32_f32(WORD8 * __restrict__ p_out, XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x1, x2); + xtbool2 check = XT_OLE_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -205,7 +206,7 @@ WORD32 xa_nn_elm_greater_lesser_equal_f32xf32_f32(WORD8 * __restrict__ p_out, XT_LASX2IP(x2, inp2_a, inp2); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x1, x2); + xtbool2 check = XT_OLE_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -243,7 +244,7 @@ WORD32 xa_nn_elm_greater_lesser_equal_f32xf32_f32(WORD8 * __restrict__ p_out, XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x1, x2); + xtbool2 check = XT_OLT_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -268,7 +269,7 @@ WORD32 xa_nn_elm_greater_lesser_equal_f32xf32_f32(WORD8 * __restrict__ p_out, XT_LASX2IP(x2, inp2_a, inp2); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x1, x2); + xtbool2 check = XT_OLT_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -306,7 +307,7 @@ WORD32 xa_nn_elm_greater_lesser_equal_f32xf32_f32(WORD8 * __restrict__ p_out, XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); uint8_t val = AE_MOVAB2(check); @@ -331,7 +332,7 @@ WORD32 xa_nn_elm_greater_lesser_equal_f32xf32_f32(WORD8 * __restrict__ p_out, XT_LASX2IP(x2, inp2_a, 
inp2); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); uint8_t val = AE_MOVAB2(check); @@ -370,7 +371,7 @@ WORD32 xa_nn_elm_greater_lesser_equal_f32xf32_f32(WORD8 * __restrict__ p_out, XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); ae_int32x2 store = AE_ZERO32(); AE_MOVF32X2(store, ones, check); @@ -393,7 +394,7 @@ WORD32 xa_nn_elm_greater_lesser_equal_f32xf32_f32(WORD8 * __restrict__ p_out, XT_LASX2IP(x2, inp2_a, inp2); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); ae_int32x2 store = AE_ZERO32(); AE_MOVF32X2(store, ones, check); @@ -477,7 +478,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x1, x2); + xtbool2 check = XT_OLE_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -499,7 +500,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LASX2IP(x2, vinp2, p_b); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x1, x2); + xtbool2 check = XT_OLE_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -535,7 +536,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x1, x2); + xtbool2 check = XT_OLT_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -557,7 +558,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LASX2IP(x2, vinp2, p_b); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x1, x2); + xtbool2 check = XT_OLT_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -593,7 +594,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x2, x1); + xtbool2 check = XT_OLE_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -615,7 +616,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LASX2IP(x2, vinp2, p_b); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x2, x1); + xtbool2 check = XT_OLE_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -651,7 +652,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x2, x1); + xtbool2 check = XT_OLT_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -673,7 +674,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LASX2IP(x2, vinp2, p_b); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x2, x1); + xtbool2 check = XT_OLT_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -709,7 +710,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); uint8_t val = AE_MOVAB2(check); @@ -731,7 
+732,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LASX2IP(x2, vinp2, p_b); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); uint8_t val = AE_MOVAB2(check); @@ -768,7 +769,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); ae_int32x2 store = AE_ZERO32(); AE_MOVF32X2(store, ones, check); @@ -788,7 +789,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LASX2IP(x2, vinp2, p_b); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); ae_int32x2 store = AE_ZERO32(); AE_MOVF32X2(store, ones, check); @@ -833,7 +834,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x2, x1); + xtbool2 check = XT_OLE_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -856,7 +857,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LASX2IP(x2, vinp2, p_b); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x2, x1); + xtbool2 check = XT_OLE_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -892,7 +893,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x2, x1); + xtbool2 check = XT_OLT_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -915,7 +916,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LASX2IP(x2, vinp2, p_b); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x2, x1); + xtbool2 check = XT_OLT_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -951,7 +952,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x1, x2); + xtbool2 check = XT_OLE_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -974,7 +975,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LASX2IP(x2, vinp2, p_b); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x1, x2); + xtbool2 check = XT_OLE_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -1010,7 +1011,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x1, x2); + xtbool2 check = XT_OLT_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -1033,7 +1034,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LASX2IP(x2, vinp2, p_b); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x1, x2); + xtbool2 check = XT_OLT_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -1069,7 +1070,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + xtbool2 check = 
AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); uint8_t val = AE_MOVAB2(check); @@ -1092,7 +1093,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LASX2IP(x2, vinp2, p_b); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); uint8_t val = AE_MOVAB2(check); @@ -1129,7 +1130,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); ae_int32x2 store = AE_ZERO32(); AE_MOVF32X2(store, ones, check); @@ -1150,7 +1151,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LASX2IP(x2, vinp2, p_b); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); ae_int32x2 store = AE_ZERO32(); AE_MOVF32X2(store, ones, check); @@ -1212,7 +1213,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x1, x2); + xtbool2 check = XT_OLE_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -1232,7 +1233,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LASX2IP(x1, inp1_a, p_a); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x1, x2); + xtbool2 check = XT_OLE_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -1266,7 +1267,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x1, x2); + xtbool2 check = XT_OLT_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -1286,7 +1287,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LASX2IP(x1, inp1_a, p_a); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x1, x2); + xtbool2 check = XT_OLT_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -1320,7 +1321,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x2, x1); + xtbool2 check = XT_OLE_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -1340,7 +1341,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LASX2IP(x1, inp1_a, p_a); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x2, x1); + xtbool2 check = XT_OLE_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -1374,7 +1375,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x2, x1); + xtbool2 check = XT_OLT_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -1394,7 +1395,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LASX2IP(x1, inp1_a, p_a); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x2, x1); + xtbool2 check = XT_OLT_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -1428,7 +1429,7 @@ static void 
internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); uint8_t val = AE_MOVAB2(check); @@ -1448,7 +1449,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LASX2IP(x1, inp1_a, p_a); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); uint8_t val = AE_MOVAB2(check); @@ -1483,7 +1484,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); ae_int32x2 store = AE_ZERO32(); AE_MOVF32X2(store, ones, check); @@ -1501,7 +1502,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LASX2IP(x1, inp1_a, p_a); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); ae_int32x2 store = AE_ZERO32(); AE_MOVF32X2(store, ones, check); @@ -1537,7 +1538,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x2, x1); + xtbool2 check = XT_OLE_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -1558,7 +1559,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LASX2IP(x1, inp1_a, p_a); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x2, x1); + xtbool2 check = XT_OLE_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -1592,7 +1593,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x2, x1); + xtbool2 check = XT_OLT_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -1613,7 +1614,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LASX2IP(x1, inp1_a, p_a); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x2, x1); + xtbool2 check = XT_OLT_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -1647,7 +1648,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x1, x2); + xtbool2 check = XT_OLE_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -1668,7 +1669,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LASX2IP(x1, inp1_a, p_a); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x1, x2); + xtbool2 check = XT_OLE_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -1702,7 +1703,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x1, x2); + xtbool2 check = XT_OLT_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -1723,7 +1724,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LASX2IP(x1, inp1_a, p_a); //y = 
XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x1, x2); + xtbool2 check = XT_OLT_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -1757,7 +1758,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); uint8_t val = AE_MOVAB2(check); @@ -1778,7 +1779,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LASX2IP(x1, inp1_a, p_a); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); uint8_t val = AE_MOVAB2(check); From 38b85c06453ec2adfdea73e230aa190bfaa2e60b Mon Sep 17 00:00:00 2001 From: dijopaul Date: Thu, 17 Jul 2025 05:27:29 -0700 Subject: [PATCH 2/4] im2row initial setup --- backends/cadence/hifi/kernels/kernels.h | 1 + 1 file changed, 1 insertion(+) diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h index 2574b9d60ee..44e2efd88cd 100644 --- a/backends/cadence/hifi/kernels/kernels.h +++ b/backends/cadence/hifi/kernels/kernels.h @@ -289,3 +289,4 @@ void dequantize( }; // namespace HiFi }; // namespace impl }; // namespace cadence + From 31f2dbd794017482b5df45b2623cc43ad3ab0549 Mon Sep 17 00:00:00 2001 From: dijopaul Date: Thu, 17 Jul 2025 05:44:11 -0700 Subject: [PATCH 3/4] Adding im2row in hifi flow with optimization --- backends/cadence/aot/functions_hifi.yaml | 5 + backends/cadence/hifi/kernels/CMakeLists.txt | 1 + backends/cadence/hifi/kernels/kernels.h | 23 +- .../cadence/hifi/operators/CMakeLists.txt | 1 + .../cadence/hifi/operators/im2row_out.cpp | 435 ++++++++++++++++++ .../hifi/third-party/nnlib/xa_nn_im2row.c | 133 ++++++ 6 files changed, 597 insertions(+), 1 deletion(-) create mode 100644 backends/cadence/hifi/operators/im2row_out.cpp create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_im2row.c diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml index 944967e3cee..851081957ca 100644 --- a/backends/cadence/aot/functions_hifi.yaml +++ b/backends/cadence/aot/functions_hifi.yaml @@ -314,6 +314,11 @@ - arg_meta: null kernel_name: cadence::impl::HiFi::quantized_linear_per_tensor_out +- func: cadence::im2row.out(Tensor input, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, Tensor in_zero_point, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::im2row_out + - func: cadence::quantized_relu_per_tensor.out(Tensor X, Tensor X_zero_point, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null diff --git a/backends/cadence/hifi/kernels/CMakeLists.txt b/backends/cadence/hifi/kernels/CMakeLists.txt index 972bb4b7ab1..d39ad9769e9 100644 --- a/backends/cadence/hifi/kernels/CMakeLists.txt +++ b/backends/cadence/hifi/kernels/CMakeLists.txt @@ -18,6 +18,7 @@ add_library( ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_fmod_broadcast_f32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_greater_lesser_equal_f32.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_im2row.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_logicalxor_bool_bool.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h index 44e2efd88cd..a3a40cd51fb 100644 --- a/backends/cadence/hifi/kernels/kernels.h +++ b/backends/cadence/hifi/kernels/kernels.h @@ -196,6 +196,28 @@ extern "C" WORD32 xa_nn_elm_where_broadcast_4D_f32xf32_f32( const unsigned char* __restrict__ p_condition, const WORD32* const p_condition_shape); +extern "C" WORD32 xa_nn_im2row_quantized( + const WORD8* __restrict__ data_im, + const WORD32 in_zero_point, + /* input parameters*/ + const WORD32 channels, + const WORD32 height, + const WORD32 width, + /* output parameters */ + const WORD32 out_height, + const WORD32 out_width, + /* convolution parameters */ + const WORD32 kernel_h, + const WORD32 kernel_w, + const WORD32 pad_h, + const WORD32 pad_w, + const WORD32 stride_h, + const WORD32 stride_w, + const WORD32 dilation_h, + const WORD32 dilation_w, + WORD8* __restrict__ data_col, + WORD32 channels_last); + extern "C" WORD32 xa_nn_reduce_mean_4D_f32_f32( FLOAT32* __restrict__ p_out, const WORD32* const p_out_shape, @@ -289,4 +311,3 @@ void dequantize( }; // namespace HiFi }; // namespace impl }; // namespace cadence - diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt index 92432cdc24c..d55e1a303f5 100644 --- a/backends/cadence/hifi/operators/CMakeLists.txt +++ b/backends/cadence/hifi/operators/CMakeLists.txt @@ -16,6 +16,7 @@ include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake) # ATen compliant ops that are needed to run this model. set(_aten_ops__srcs + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/im2row_out.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_add.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_atan2.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_bitwise_and.cpp" diff --git a/backends/cadence/hifi/operators/im2row_out.cpp b/backends/cadence/hifi/operators/im2row_out.cpp new file mode 100644 index 00000000000..e60c747c9e8 --- /dev/null +++ b/backends/cadence/hifi/operators/im2row_out.cpp @@ -0,0 +1,435 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#define ALIGN_PTR(x, bytes) ((((unsigned)(x)) + (bytes - 1)) & (~(bytes - 1)))
+
+using ::executorch::aten::IntArrayRef;
+using ::executorch::aten::ScalarType;
+using ::executorch::aten::Tensor;
+using ::executorch::runtime::KernelRuntimeContext;
+
+namespace cadence {
+namespace impl {
+namespace HiFi {
+namespace native {
+
+template <typename T>
+__attribute__((always_inline)) void im2row_(
+    const T* __restrict__ data_im,
+    const int32_t in_zero_point,
+    /* input parameters*/
+    const int32_t channels,
+    const int32_t height,
+    const int32_t width,
+    /* output parameters */
+    const int32_t out_height,
+    const int32_t out_width,
+    /* convolution parameters */
+    const int32_t kernel_h,
+    const int32_t kernel_w,
+    const int32_t pad_h,
+    const int32_t pad_w,
+    const int32_t stride_h,
+    const int32_t stride_w,
+    const int32_t dilation_h,
+    const int32_t dilation_w,
+    T* __restrict__ data_col,
+    bool channels_last) {
+  // Consider convolving the input image of dimensions channels * height * width
+  // (or height * width * channels for NHWC layout) with a filter of dimensions
+  // channels * kernel_h * kernel_w. Assume that this convolution will produce
+  // an output of dimensions out_height x out_width. For each point in the
+  // output, im2row takes the data from the input that is used in the
+  // computation of that output point, and flattens it into a vector of size
+  // channels_col = channels * kernel_h * kernel_w. The output of im2row will
+  // therefore be a 2D array of size (out_height * out_width) x channels_col.
+  const int32_t channels_col = channels * kernel_h * kernel_w;
+
+  // If the layout is NHWC, we can copy 'channels' worth of contiguous data
+  // points when performing im2row.
+  if (channels_last) {
+    // Iterate over the output domain
+    for (int _h = 0; _h < out_height; ++_h) {
+      for (int _w = 0; _w < out_width; ++_w) {
+        int32_t i_col = _h * out_width + _w;
+        // Each point in the output domain is the result of applying a filter of
+        // size kernel_h x kernel_w x channels on the input. But since channels
+        // is contiguous, we will not explicitly have a loop for it.
+        for (int _kh = 0; _kh < kernel_h; ++_kh) {
+          int32_t h_im = _h * stride_h - pad_h + _kh * dilation_h;
+          for (int _kw = 0; _kw < kernel_w; ++_kw) {
+            int32_t w_im = _w * stride_w - pad_w + _kw * dilation_w;
+
+            // h_im and w_im are the actual height and width coordinates of the
+            // input tensor from where we need to copy 'channels' points.
+            const T* __restrict__ slice_im =
+                data_im + (h_im * width + w_im) * channels;
+            T* __restrict__ slice_col = data_col + i_col * channels_col +
+                (_kh * kernel_w + _kw) * channels;
+            // If the coordinates were within the input domain, we copy
+            // 'channels' contiguous values. Otherwise we will fill the output
+            // with 0's.
+            if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) {
+              std::memcpy(slice_col, slice_im, channels * sizeof(T));
+            } else {
+              std::fill_n(slice_col, channels, T(in_zero_point));
+            }
+          }
+        }
+      }
+    }
+  } else {
+    // Iterate over the output domain
+    for (int _h = 0; _h < out_height; ++_h) {
+      for (int _w = 0; _w < out_width; ++_w) {
+        int32_t i_col = _h * out_width + _w;
+
+        // Each point in the output domain is the result of applying a filter
+        // of size channels * kernel_h x kernel_w on the input
+        for (int _c = 0; _c < channels; ++_c) {
+          for (int _kh = 0; _kh < kernel_h; ++_kh) {
+            for (int _kw = 0; _kw < kernel_w; ++_kw) {
+              // c_col is the linearized access in the channels_col vector.
+              int32_t c_col = (_c * kernel_h + _kh) * kernel_w + _kw;
+              // h_im and w_im are the actual height and width coordinates of
+              // the input tensor that we need to copy to the output.
+              int32_t h_im = _h * stride_h - pad_h + _kh * dilation_h;
+              int32_t w_im = _w * stride_w - pad_w + _kw * dilation_w;
+              // If the current data access is within the input tensor, copy the
+              // value
+              data_col[i_col * channels_col + c_col] =
+                  (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width)
+                  ? data_im[(_c * height + h_im) * width + w_im]
+                  : static_cast<T>(in_zero_point);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+void im2row_out(
+    __ET_UNUSED KernelRuntimeContext& ctx,
+    const Tensor& input,
+    IntArrayRef kernel_size,
+    IntArrayRef dilation,
+    IntArrayRef padding,
+    IntArrayRef stride,
+    const Tensor& in_zero_point,
+    bool channel_last,
+    Tensor& out) {
+  // Compute the input tensor's dims
+  bool unit_height = input.dim() == 3;
+  const int32_t batch_size = input.size(0);
+  const int32_t in_c =
+      channel_last ? input.size(3 - unit_height) : input.size(1);
+  const int32_t in_h =
+      unit_height ? 1 : (channel_last ? input.size(1) : input.size(2));
+  const int32_t in_w =
+      channel_last ? input.size(2 - unit_height) : input.size(3 - unit_height);
+
+  // Get the kernel parameters
+  int32_t kernel_h = kernel_size[0];
+  int32_t kernel_w = kernel_size[1];
+  int32_t dilation_h = dilation[0];
+  int32_t dilation_w = dilation[1];
+  int32_t pad_h = padding[0];
+  int32_t pad_w = padding[1];
+  int32_t stride_h = stride[0];
+  int32_t stride_w = stride[1];
+
+  // If we were to apply a convolution on the input tensor, compute the output
+  // height and width.
+  int32_t out_h =
+      (in_h + 2 * pad_h - dilation_h * (kernel_h - 1) - 1) / stride_h + 1;
+  int32_t out_w =
+      (in_w + 2 * pad_w - dilation_w * (kernel_w - 1) - 1) / stride_w + 1;
+
+  ET_DCHECK_MSG(
+      (out_h * out_w) == out.size(1), "dimension mismatch for output");
+  ET_DCHECK_MSG(
+      (kernel_h * kernel_w * in_c) == out.size(2),
+      "dimension mismatch for output");
+  // Check if the input is per-tensor quantized or per-channel quantized. The
+  // zero point for each batch could differ for per-channel quantized input.
+ bool per_tensor_quantized = in_zero_point.numel() == 1; + + bool optimized = false; + if(input.scalar_type() == ScalarType::Char || input.scalar_type() == ScalarType::Byte) + optimized = true; + + if(!optimized) { + WORD8* ptr1 = (WORD8*)kernels::allocate_temp_memory( + ctx, + ((batch_size * in_c * in_h * in_w) + 8) * + sizeof(WORD8)); + + WORD8* pin = (WORD8*)ALIGN_PTR(ptr1, 8); + + WORD32 p_inp_shape[4]; + p_inp_shape[0] = input.size(0); + p_inp_shape[1] = in_c; + p_inp_shape[2] = in_h; + p_inp_shape[3] = in_w; + + WORD32 p_out_shape[4]; + p_out_shape[0] = input.size(0); + p_out_shape[1] = in_h; + p_out_shape[2] = in_w; + p_out_shape[3] = in_c; + + WORD32 p_permute_vec[4] = {0, 2, 3, 1}; + + WORD8* __restrict__ p_inp = + (WORD8* __restrict__)input.const_data_ptr(); + + xa_nn_transpose_8_8( + pin, + p_out_shape, + p_inp, + p_inp_shape, + p_permute_vec, + 4, // input dimensions + 4); // output dimensions + + const int8_t* __restrict__ in_data = pin; + int8_t* __restrict__ out_data = out.mutable_data_ptr(); + const int32_t* __restrict__ zero_point = + in_zero_point.const_data_ptr(); + int32_t in_plane = in_c * in_h * in_w; + int32_t out_plane = kernel_h * kernel_w * in_c * out_h * out_w; + for (size_t n = 0; n < batch_size; ++n) { + xa_nn_im2row_quantized( + &in_data[n * in_plane], + per_tensor_quantized ? zero_point[0] : zero_point[n], + in_c, + in_h, + in_w, + out_h, + out_w, + kernel_h, + kernel_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + &out_data[n * out_plane], + 1/*channel_last*/); + } + } + else { +#define typed_im2row(dtype, ctype) \ + case ScalarType::dtype: { \ + const ctype* __restrict__ in_data = input.const_data_ptr(); \ + ctype* __restrict__ out_data = out.mutable_data_ptr(); \ + const int32_t* __restrict__ zero_point = \ + in_zero_point.const_data_ptr(); \ + int32_t in_plane = in_c * in_h * in_w; \ + int32_t out_plane = kernel_h * kernel_w * in_c * out_h * out_w; \ + for (size_t n = 0; n < batch_size; ++n) { \ + im2row_( \ + &in_data[n * in_plane], \ + per_tensor_quantized ? zero_point[0] : zero_point[n], \ + in_c, \ + in_h, \ + in_w, \ + out_h, \ + out_w, \ + kernel_h, \ + kernel_w, \ + pad_h, \ + pad_w, \ + stride_h, \ + stride_w, \ + dilation_h, \ + dilation_w, \ + &out_data[n * out_plane], \ + channel_last); \ + } \ + break; \ + } + + ScalarType dtype = input.scalar_type(); + switch (dtype) { + typed_im2row(Float, float); + typed_im2row(Byte, uint8_t); + typed_im2row(Char, int8_t); + default: + ET_DCHECK_MSG( + false, + "im2row not implemented for dtype %s", + torch::executor::toString(dtype)); + } +#undef typed_im2row + } +} + +void im2row_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + IntArrayRef kernel_size, + IntArrayRef dilation, + IntArrayRef padding, + IntArrayRef stride, + int64_t in_zero_point, + bool channel_last, + Tensor& out) { + // Compute the input tensor's dims + bool unit_height = input.dim() == 3; + const int32_t batch_size = input.size(0); + const int32_t in_c = + channel_last ? input.size(3 - unit_height) : input.size(1); + const int32_t in_h = + unit_height ? 1 : (channel_last ? input.size(1) : input.size(2)); + const int32_t in_w = + channel_last ? 
input.size(2 - unit_height) : input.size(3 - unit_height); + + // Get the kernel parameters + int32_t kernel_h = kernel_size[0]; + int32_t kernel_w = kernel_size[1]; + int32_t dilation_h = dilation[0]; + int32_t dilation_w = dilation[1]; + int32_t pad_h = padding[0]; + int32_t pad_w = padding[1]; + int32_t stride_h = stride[0]; + int32_t stride_w = stride[1]; + + // If we were to apply a convolution on the input tensor, compute the output + // height and width. + int32_t out_h = + (in_h + 2 * pad_h - dilation_h * (kernel_h - 1) - 1) / stride_h + 1; + int32_t out_w = + (in_w + 2 * pad_w - dilation_w * (kernel_w - 1) - 1) / stride_w + 1; + + ET_DCHECK_MSG( + (out_h * out_w) == out.size(1), "dimension mismatch for output"); + ET_DCHECK_MSG( + (kernel_h * kernel_w * in_c) == out.size(2), + "dimension mismatch for output"); + + bool optimized = false; + if(input.scalar_type() == ScalarType::Char || input.scalar_type() == ScalarType::Byte) + optimized = true; + + if(!optimized) { + WORD8* ptr1 = (WORD8*)kernels::allocate_temp_memory( + ctx, + ((batch_size * in_c * in_h * in_w) + 8) * + sizeof(WORD8)); + + WORD8* pin = (WORD8*)ALIGN_PTR(ptr1, 8); + + WORD32 p_inp_shape[4]; + p_inp_shape[0] = input.size(0); + p_inp_shape[1] = in_c; + p_inp_shape[2] = in_h; + p_inp_shape[3] = in_w; + + WORD32 p_out_shape[4]; + p_out_shape[0] = input.size(0); + p_out_shape[1] = in_h; + p_out_shape[2] = in_w; + p_out_shape[3] = in_c; + + WORD32 p_permute_vec[4] = {0, 2, 3, 1}; + + WORD8* __restrict__ p_inp = + (WORD8* __restrict__)input.const_data_ptr(); + + xa_nn_transpose_8_8( + pin, + p_out_shape, + p_inp, + p_inp_shape, + p_permute_vec, + 4, // input dimensions + 4); // output dimensions + + const int8_t* __restrict__ in_data = pin; + int8_t* __restrict__ out_data = out.mutable_data_ptr(); + int32_t in_plane = in_c * in_h * in_w; + int32_t out_plane = kernel_h * kernel_w * in_c * out_h * out_w; + for (size_t n = 0; n < batch_size; ++n) { + xa_nn_im2row_quantized( + &in_data[n * in_plane], + (int32_t)in_zero_point, + in_c, + in_h, + in_w, + out_h, + out_w, + kernel_h, + kernel_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + &out_data[n * out_plane], + 1/*channel_last*/); + } + } + else { +#define typed_im2row_per_tensor(dtype, ctype) \ + case ScalarType::dtype: { \ + const ctype* __restrict__ in_data = input.const_data_ptr(); \ + ctype* __restrict__ out_data = out.mutable_data_ptr(); \ + int32_t in_plane = in_c * in_h * in_w; \ + int32_t out_plane = kernel_h * kernel_w * in_c * out_h * out_w; \ + for (size_t n = 0; n < batch_size; ++n) { \ + im2row_( \ + &in_data[n * in_plane], \ + in_zero_point, \ + in_c, \ + in_h, \ + in_w, \ + out_h, \ + out_w, \ + kernel_h, \ + kernel_w, \ + pad_h, \ + pad_w, \ + stride_h, \ + stride_w, \ + dilation_h, \ + dilation_w, \ + &out_data[n * out_plane], \ + channel_last); \ + } \ + break; \ + } + + ScalarType dtype = input.scalar_type(); + switch (dtype) { + typed_im2row_per_tensor(Float, float); + typed_im2row_per_tensor(Byte, uint8_t); + typed_im2row_per_tensor(Char, int8_t); + default: + ET_DCHECK_MSG( + false, + "im2row.per_tensor not implemented for dtype %s", + torch::executor::toString(dtype)); + } +#undef typed_im2row_per_tensor + } +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence \ No newline at end of file diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_im2row.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_im2row.c new file mode 100644 index 00000000000..3746991d430 --- /dev/null +++ 
b/backends/cadence/hifi/third-party/nnlib/xa_nn_im2row.c @@ -0,0 +1,133 @@ +#include "xa_type_def.h" +#include "xa_nnlib_common_fpu.h" +#include "xa_nn_common.h" +#include "xa_nnlib_err_chk.h" +//#include "xa_nn_basic_state.h" +#include "xa_nnlib_kernels_api.h" + +WORD32 xa_nn_im2row_quantized( + const WORD8* __restrict__ data_im, + const WORD32 in_zero_point, + /* input parameters*/ + const WORD32 channels, + const WORD32 height, + const WORD32 width, + /* output parameters */ + const WORD32 out_height, + const WORD32 out_width, + /* convolution parameters */ + const WORD32 kernel_h, + const WORD32 kernel_w, + const WORD32 pad_h, + const WORD32 pad_w, + const WORD32 stride_h, + const WORD32 stride_w, + const WORD32 dilation_h, + const WORD32 dilation_w, + WORD8* __restrict__ data_col, + WORD32 channels_last) +{ + const WORD32 channels_col = channels * kernel_h * kernel_w; + + // If the layout is NHWC, we can copy 'channels' worth of contiguous data + // points when performing im2row. + if (channels_last) { + // Iterate over the output domain + for (int _h = 0; _h < out_height; ++_h) { + for (int _w = 0; _w < out_width; ++_w) { + int32_t i_col = _h * out_width + _w; + // Each point in the output domain is the result of applying a filter of + // size kernel_h x kernel_w x channels on the input. But since channels + // is contiguous, we will not explicitly have a loop for it. + for (int _kh = 0; _kh < kernel_h; ++_kh) { + int32_t h_im = _h * stride_h - pad_h + _kh * dilation_h; + for (int _kw = 0; _kw < kernel_w; ++_kw) { + int32_t w_im = _w * stride_w - pad_w + _kw * dilation_w; + + // h_im and w_im are the actual height and width coordinates of the + // input tensor from where we need to copy 'channels' points. + const int8_t* __restrict__ slice_im = + data_im + (h_im * width + w_im) * channels; + int8_t* __restrict__ slice_col = data_col + i_col * channels_col + + (_kh * kernel_w + _kw) * channels; + // If the coordinates were within the input domain, we copy + // 'channels' contiguous values. Otherwise we will fill the output + // with 0's. 
+            if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) {
+              const ae_int24x2 *pae_inp = (const ae_int24x2 *)slice_im;
+              ae_int24x2 *pae_out = (ae_int24x2 *)slice_col;
+              ae_valign inp_a, out_a;
+              inp_a = AE_LA64_PP(pae_inp);
+              out_a = AE_ZALIGN64();
+
+              int ic;
+              for (ic = 0; ic + 5 < channels; ic += 6)
+              {
+                ae_int24x2 d0;
+                AE_LA24X2_IP(d0, inp_a, pae_inp);
+                AE_SA24X2_IP(d0, out_a, pae_out);
+              }
+              AE_SA64POS_FP(out_a, pae_out);
+              for (int i = ic; i < channels; i++)
+              {
+                slice_col[i] = slice_im[i];
+              }
+            }
+            else {
+              ae_int24x2 *pae_out = (ae_int24x2 *)slice_col;
+              ae_valign out_a;
+              out_a = AE_ZALIGN64();
+
+              ae_int32x2 tmp = AE_MOVDA32(in_zero_point);
+              ae_int32x2 in_zero_point32x2 = AE_SLLI32(tmp, 8);
+              in_zero_point32x2 = AE_OR32(in_zero_point32x2, tmp);
+              in_zero_point32x2 = AE_SLLI32(in_zero_point32x2, 8);
+              in_zero_point32x2 = AE_OR32(in_zero_point32x2, tmp);
+
+              ae_int24x2 d0 = AE_MOVINT24X2_FROMINT32X2(in_zero_point32x2);
+              int ic;
+              for (ic = 0; ic + 5 < channels; ic += 6)
+              {
+                AE_SA24X2_IP(d0, out_a, pae_out);
+              }
+              AE_SA64POS_FP(out_a, pae_out);
+              for (int i = ic; i < channels; i++)
+              {
+                slice_col[i] = (int8_t)(in_zero_point);
+              }
+            }
+          }
+        }
+      }
+    }
+  } else {
+    // Iterate over the output domain
+    for (int _h = 0; _h < out_height; ++_h) {
+      for (int _w = 0; _w < out_width; ++_w) {
+        int32_t i_col = _h * out_width + _w;
+
+        // Each point in the output domain is the result of applying a filter
+        // of size channels * kernel_h x kernel_w on the input
+        for (int _c = 0; _c < channels; ++_c) {
+          for (int _kh = 0; _kh < kernel_h; ++_kh) {
+            for (int _kw = 0; _kw < kernel_w; ++_kw) {
+              // c_col is the linearized access in the channels_col vector.
+              int32_t c_col = (_c * kernel_h + _kh) * kernel_w + _kw;
+              // h_im and w_im are the actual height and width coordinates of
+              // the input tensor that we need to copy to the output.
+              int32_t h_im = _h * stride_h - pad_h + _kh * dilation_h;
+              int32_t w_im = _w * stride_w - pad_w + _kw * dilation_w;
+              // If the current data access is within the input tensor, copy the
+              // value
+              data_col[i_col * channels_col + c_col] =
+                  (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width)
+                  ?
data_im[(_c * height + h_im) * width + w_im] + : (int8_t)(in_zero_point); + } + } + } + } + } + } + return 0; +} From 589e14b6ea7e34cdfb025d4440df72bb230c0526 Mon Sep 17 00:00:00 2001 From: dijopaul Date: Thu, 17 Jul 2025 09:26:19 -0700 Subject: [PATCH 4/4] Fixing lint error --- .../cadence/hifi/operators/im2row_out.cpp | 202 +++++++++--------- 1 file changed, 99 insertions(+), 103 deletions(-) diff --git a/backends/cadence/hifi/operators/im2row_out.cpp b/backends/cadence/hifi/operators/im2row_out.cpp index e60c747c9e8..4793a36fec4 100644 --- a/backends/cadence/hifi/operators/im2row_out.cpp +++ b/backends/cadence/hifi/operators/im2row_out.cpp @@ -167,16 +167,15 @@ void im2row_out( bool per_tensor_quantized = in_zero_point.numel() == 1; bool optimized = false; - if(input.scalar_type() == ScalarType::Char || input.scalar_type() == ScalarType::Byte) + if (input.scalar_type() == ScalarType::Char || + input.scalar_type() == ScalarType::Byte) optimized = true; - if(!optimized) { + if (!optimized) { WORD8* ptr1 = (WORD8*)kernels::allocate_temp_memory( - ctx, - ((batch_size * in_c * in_h * in_w) + 8) * - sizeof(WORD8)); + ctx, ((batch_size * in_c * in_h * in_w) + 8) * sizeof(WORD8)); - WORD8* pin = (WORD8*)ALIGN_PTR(ptr1, 8); + WORD8* pin = (WORD8*)ALIGN_PTR(ptr1, 8); WORD32 p_inp_shape[4]; p_inp_shape[0] = input.size(0); @@ -189,49 +188,48 @@ void im2row_out( p_out_shape[1] = in_h; p_out_shape[2] = in_w; p_out_shape[3] = in_c; - + WORD32 p_permute_vec[4] = {0, 2, 3, 1}; WORD8* __restrict__ p_inp = - (WORD8* __restrict__)input.const_data_ptr(); + (WORD8* __restrict__)input.const_data_ptr(); xa_nn_transpose_8_8( - pin, - p_out_shape, - p_inp, - p_inp_shape, - p_permute_vec, - 4, // input dimensions - 4); // output dimensions + pin, + p_out_shape, + p_inp, + p_inp_shape, + p_permute_vec, + 4, // input dimensions + 4); // output dimensions const int8_t* __restrict__ in_data = pin; - int8_t* __restrict__ out_data = out.mutable_data_ptr(); - const int32_t* __restrict__ zero_point = - in_zero_point.const_data_ptr(); - int32_t in_plane = in_c * in_h * in_w; - int32_t out_plane = kernel_h * kernel_w * in_c * out_h * out_w; - for (size_t n = 0; n < batch_size; ++n) { - xa_nn_im2row_quantized( - &in_data[n * in_plane], - per_tensor_quantized ? zero_point[0] : zero_point[n], - in_c, - in_h, - in_w, - out_h, - out_w, - kernel_h, - kernel_w, - pad_h, - pad_w, - stride_h, - stride_w, - dilation_h, - dilation_w, - &out_data[n * out_plane], - 1/*channel_last*/); + int8_t* __restrict__ out_data = out.mutable_data_ptr(); + const int32_t* __restrict__ zero_point = + in_zero_point.const_data_ptr(); + int32_t in_plane = in_c * in_h * in_w; + int32_t out_plane = kernel_h * kernel_w * in_c * out_h * out_w; + for (size_t n = 0; n < batch_size; ++n) { + xa_nn_im2row_quantized( + &in_data[n * in_plane], + per_tensor_quantized ? 
zero_point[0] : zero_point[n], + in_c, + in_h, + in_w, + out_h, + out_w, + kernel_h, + kernel_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + &out_data[n * out_plane], + 1 /*channel_last*/); } - } - else { + } else { #define typed_im2row(dtype, ctype) \ case ScalarType::dtype: { \ const ctype* __restrict__ in_data = input.const_data_ptr(); \ @@ -263,17 +261,17 @@ void im2row_out( break; \ } - ScalarType dtype = input.scalar_type(); - switch (dtype) { - typed_im2row(Float, float); - typed_im2row(Byte, uint8_t); - typed_im2row(Char, int8_t); - default: - ET_DCHECK_MSG( - false, - "im2row not implemented for dtype %s", - torch::executor::toString(dtype)); - } + ScalarType dtype = input.scalar_type(); + switch (dtype) { + typed_im2row(Float, float); + typed_im2row(Byte, uint8_t); + typed_im2row(Char, int8_t); + default: + ET_DCHECK_MSG( + false, + "im2row not implemented for dtype %s", + torch::executor::toString(dtype)); + } #undef typed_im2row } } @@ -320,18 +318,17 @@ void im2row_per_tensor_out( ET_DCHECK_MSG( (kernel_h * kernel_w * in_c) == out.size(2), "dimension mismatch for output"); - + bool optimized = false; - if(input.scalar_type() == ScalarType::Char || input.scalar_type() == ScalarType::Byte) + if (input.scalar_type() == ScalarType::Char || + input.scalar_type() == ScalarType::Byte) optimized = true; - if(!optimized) { + if (!optimized) { WORD8* ptr1 = (WORD8*)kernels::allocate_temp_memory( - ctx, - ((batch_size * in_c * in_h * in_w) + 8) * - sizeof(WORD8)); + ctx, ((batch_size * in_c * in_h * in_w) + 8) * sizeof(WORD8)); - WORD8* pin = (WORD8*)ALIGN_PTR(ptr1, 8); + WORD8* pin = (WORD8*)ALIGN_PTR(ptr1, 8); WORD32 p_inp_shape[4]; p_inp_shape[0] = input.size(0); @@ -344,47 +341,46 @@ void im2row_per_tensor_out( p_out_shape[1] = in_h; p_out_shape[2] = in_w; p_out_shape[3] = in_c; - + WORD32 p_permute_vec[4] = {0, 2, 3, 1}; WORD8* __restrict__ p_inp = - (WORD8* __restrict__)input.const_data_ptr(); + (WORD8* __restrict__)input.const_data_ptr(); xa_nn_transpose_8_8( - pin, - p_out_shape, - p_inp, - p_inp_shape, - p_permute_vec, - 4, // input dimensions - 4); // output dimensions + pin, + p_out_shape, + p_inp, + p_inp_shape, + p_permute_vec, + 4, // input dimensions + 4); // output dimensions const int8_t* __restrict__ in_data = pin; - int8_t* __restrict__ out_data = out.mutable_data_ptr(); - int32_t in_plane = in_c * in_h * in_w; - int32_t out_plane = kernel_h * kernel_w * in_c * out_h * out_w; - for (size_t n = 0; n < batch_size; ++n) { - xa_nn_im2row_quantized( - &in_data[n * in_plane], - (int32_t)in_zero_point, - in_c, - in_h, - in_w, - out_h, - out_w, - kernel_h, - kernel_w, - pad_h, - pad_w, - stride_h, - stride_w, - dilation_h, - dilation_w, - &out_data[n * out_plane], - 1/*channel_last*/); + int8_t* __restrict__ out_data = out.mutable_data_ptr(); + int32_t in_plane = in_c * in_h * in_w; + int32_t out_plane = kernel_h * kernel_w * in_c * out_h * out_w; + for (size_t n = 0; n < batch_size; ++n) { + xa_nn_im2row_quantized( + &in_data[n * in_plane], + (int32_t)in_zero_point, + in_c, + in_h, + in_w, + out_h, + out_w, + kernel_h, + kernel_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + &out_data[n * out_plane], + 1 /*channel_last*/); } - } - else { + } else { #define typed_im2row_per_tensor(dtype, ctype) \ case ScalarType::dtype: { \ const ctype* __restrict__ in_data = input.const_data_ptr(); \ @@ -414,17 +410,17 @@ void im2row_per_tensor_out( break; \ } - ScalarType dtype = input.scalar_type(); - switch (dtype) { - 
typed_im2row_per_tensor(Float, float); - typed_im2row_per_tensor(Byte, uint8_t); - typed_im2row_per_tensor(Char, int8_t); - default: - ET_DCHECK_MSG( - false, - "im2row.per_tensor not implemented for dtype %s", - torch::executor::toString(dtype)); - } + ScalarType dtype = input.scalar_type(); + switch (dtype) { + typed_im2row_per_tensor(Float, float); + typed_im2row_per_tensor(Byte, uint8_t); + typed_im2row_per_tensor(Char, int8_t); + default: + ET_DCHECK_MSG( + false, + "im2row.per_tensor not implemented for dtype %s", + torch::executor::toString(dtype)); + } #undef typed_im2row_per_tensor } }
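
For reference, the NCHW indexing that both the `im2row_` template in `im2row_out.cpp` and the portable branch of `xa_nn_im2row_quantized` implement can be exercised on a host with the small standalone sketch below. It is not part of the patches; the function name `im2row_nchw_ref` and the sizes used in `main` are hypothetical, chosen only to illustrate the (out_h * out_w) x (channels * kernel_h * kernel_w) output layout described in the patch comments.

```c
#include <stdint.h>
#include <stdio.h>

/* Plain-C restatement of the NCHW im2row indexing used in the patches:
 * the output is (out_h * out_w) rows by (channels * kernel_h * kernel_w)
 * columns, with out-of-bounds taps filled with the zero point. */
static void im2row_nchw_ref(const int8_t *im, int8_t zero_point,
                            int channels, int height, int width,
                            int out_h, int out_w,
                            int kernel_h, int kernel_w,
                            int pad_h, int pad_w,
                            int stride_h, int stride_w,
                            int dilation_h, int dilation_w,
                            int8_t *col) {
  const int channels_col = channels * kernel_h * kernel_w;
  for (int oh = 0; oh < out_h; ++oh) {
    for (int ow = 0; ow < out_w; ++ow) {
      const int i_col = oh * out_w + ow;
      for (int c = 0; c < channels; ++c) {
        for (int kh = 0; kh < kernel_h; ++kh) {
          for (int kw = 0; kw < kernel_w; ++kw) {
            const int c_col = (c * kernel_h + kh) * kernel_w + kw;
            const int h_im = oh * stride_h - pad_h + kh * dilation_h;
            const int w_im = ow * stride_w - pad_w + kw * dilation_w;
            col[i_col * channels_col + c_col] =
                (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width)
                    ? im[(c * height + h_im) * width + w_im]
                    : zero_point;
          }
        }
      }
    }
  }
}

int main(void) {
  /* Example sizes (made up): 2x5x5 input, 3x3 kernel, stride 1, pad 1. */
  enum { C = 2, H = 5, W = 5, KH = 3, KW = 3 };
  const int out_h = (H + 2 * 1 - 1 * (KH - 1) - 1) / 1 + 1; /* = 5 */
  const int out_w = (W + 2 * 1 - 1 * (KW - 1) - 1) / 1 + 1; /* = 5 */
  int8_t im[C * H * W];
  int8_t col[5 * 5 * C * KH * KW]; /* out_h * out_w = 5 * 5 for these sizes */
  for (int i = 0; i < C * H * W; ++i) im[i] = (int8_t)i;
  im2row_nchw_ref(im, 0, C, H, W, out_h, out_w, KH, KW,
                  1, 1, 1, 1, 1, 1, col);
  printf("col is %d x %d\n", out_h * out_w, C * KH * KW); /* 25 x 18 */
  return 0;
}
```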