From 37d16b3807088d8b103d5faa79001fe7dc013439 Mon Sep 17 00:00:00 2001 From: dijopaul Date: Mon, 30 Jun 2025 05:20:37 -0700 Subject: [PATCH 1/4] Fix for build issue in greater_lesser_equal on custom cores --- .../nnlib/xa_nn_greater_lesser_equal_f32.c | 117 +++++++++--------- 1 file changed, 59 insertions(+), 58 deletions(-) diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_greater_lesser_equal_f32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_greater_lesser_equal_f32.c index 9e51357b1a6..35581a42471 100644 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_greater_lesser_equal_f32.c +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_greater_lesser_equal_f32.c @@ -2,6 +2,7 @@ #include "xa_nnlib_common_fpu.h" #include "xa_nn_common.h" #include "xa_nnlib_err_chk.h" +//#include "xa_nn_basic_state.h" #include "xa_nnlib_kernels_api.h" @@ -54,7 +55,7 @@ WORD32 xa_nn_elm_greater_lesser_equal_f32xf32_f32(WORD8 * __restrict__ p_out, XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x2, x1); + xtbool2 check = XT_OLE_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -79,7 +80,7 @@ WORD32 xa_nn_elm_greater_lesser_equal_f32xf32_f32(WORD8 * __restrict__ p_out, XT_LASX2IP(x2, inp2_a, inp2); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x2, x1); + xtbool2 check = XT_OLE_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -117,7 +118,7 @@ WORD32 xa_nn_elm_greater_lesser_equal_f32xf32_f32(WORD8 * __restrict__ p_out, XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x2, x1); + xtbool2 check = XT_OLT_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -142,7 +143,7 @@ WORD32 xa_nn_elm_greater_lesser_equal_f32xf32_f32(WORD8 * __restrict__ p_out, XT_LASX2IP(x2, inp2_a, inp2); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x2, x1); + xtbool2 check = XT_OLT_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -180,7 +181,7 @@ WORD32 xa_nn_elm_greater_lesser_equal_f32xf32_f32(WORD8 * __restrict__ p_out, XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x1, x2); + xtbool2 check = XT_OLE_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -205,7 +206,7 @@ WORD32 xa_nn_elm_greater_lesser_equal_f32xf32_f32(WORD8 * __restrict__ p_out, XT_LASX2IP(x2, inp2_a, inp2); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x1, x2); + xtbool2 check = XT_OLE_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -243,7 +244,7 @@ WORD32 xa_nn_elm_greater_lesser_equal_f32xf32_f32(WORD8 * __restrict__ p_out, XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x1, x2); + xtbool2 check = XT_OLT_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -268,7 +269,7 @@ WORD32 xa_nn_elm_greater_lesser_equal_f32xf32_f32(WORD8 * __restrict__ p_out, XT_LASX2IP(x2, inp2_a, inp2); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x1, x2); + xtbool2 check = XT_OLT_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -306,7 +307,7 @@ WORD32 xa_nn_elm_greater_lesser_equal_f32xf32_f32(WORD8 * __restrict__ p_out, XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); uint8_t val = AE_MOVAB2(check); @@ -331,7 +332,7 @@ WORD32 xa_nn_elm_greater_lesser_equal_f32xf32_f32(WORD8 * __restrict__ p_out, XT_LASX2IP(x2, inp2_a, 
inp2); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); uint8_t val = AE_MOVAB2(check); @@ -370,7 +371,7 @@ WORD32 xa_nn_elm_greater_lesser_equal_f32xf32_f32(WORD8 * __restrict__ p_out, XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); ae_int32x2 store = AE_ZERO32(); AE_MOVF32X2(store, ones, check); @@ -393,7 +394,7 @@ WORD32 xa_nn_elm_greater_lesser_equal_f32xf32_f32(WORD8 * __restrict__ p_out, XT_LASX2IP(x2, inp2_a, inp2); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); ae_int32x2 store = AE_ZERO32(); AE_MOVF32X2(store, ones, check); @@ -477,7 +478,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x1, x2); + xtbool2 check = XT_OLE_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -499,7 +500,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LASX2IP(x2, vinp2, p_b); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x1, x2); + xtbool2 check = XT_OLE_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -535,7 +536,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x1, x2); + xtbool2 check = XT_OLT_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -557,7 +558,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LASX2IP(x2, vinp2, p_b); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x1, x2); + xtbool2 check = XT_OLT_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -593,7 +594,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x2, x1); + xtbool2 check = XT_OLE_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -615,7 +616,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LASX2IP(x2, vinp2, p_b); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x2, x1); + xtbool2 check = XT_OLE_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -651,7 +652,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x2, x1); + xtbool2 check = XT_OLT_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -673,7 +674,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LASX2IP(x2, vinp2, p_b); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x2, x1); + xtbool2 check = XT_OLT_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -709,7 +710,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); uint8_t val = AE_MOVAB2(check); @@ -731,7 
+732,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LASX2IP(x2, vinp2, p_b); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); uint8_t val = AE_MOVAB2(check); @@ -768,7 +769,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); ae_int32x2 store = AE_ZERO32(); AE_MOVF32X2(store, ones, check); @@ -788,7 +789,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LASX2IP(x2, vinp2, p_b); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); ae_int32x2 store = AE_ZERO32(); AE_MOVF32X2(store, ones, check); @@ -833,7 +834,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x2, x1); + xtbool2 check = XT_OLE_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -856,7 +857,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LASX2IP(x2, vinp2, p_b); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x2, x1); + xtbool2 check = XT_OLE_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -892,7 +893,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x2, x1); + xtbool2 check = XT_OLT_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -915,7 +916,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LASX2IP(x2, vinp2, p_b); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x2, x1); + xtbool2 check = XT_OLT_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -951,7 +952,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x1, x2); + xtbool2 check = XT_OLE_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -974,7 +975,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LASX2IP(x2, vinp2, p_b); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x1, x2); + xtbool2 check = XT_OLE_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -1010,7 +1011,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x1, x2); + xtbool2 check = XT_OLT_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -1033,7 +1034,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LASX2IP(x2, vinp2, p_b); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x1, x2); + xtbool2 check = XT_OLT_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -1069,7 +1070,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + xtbool2 check = 
AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); uint8_t val = AE_MOVAB2(check); @@ -1092,7 +1093,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LASX2IP(x2, vinp2, p_b); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); uint8_t val = AE_MOVAB2(check); @@ -1129,7 +1130,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); ae_int32x2 store = AE_ZERO32(); AE_MOVF32X2(store, ones, check); @@ -1150,7 +1151,7 @@ static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * XT_LASX2IP(x2, vinp2, p_b); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); ae_int32x2 store = AE_ZERO32(); AE_MOVF32X2(store, ones, check); @@ -1212,7 +1213,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x1, x2); + xtbool2 check = XT_OLE_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -1232,7 +1233,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LASX2IP(x1, inp1_a, p_a); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x1, x2); + xtbool2 check = XT_OLE_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -1266,7 +1267,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x1, x2); + xtbool2 check = XT_OLT_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -1286,7 +1287,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LASX2IP(x1, inp1_a, p_a); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x1, x2); + xtbool2 check = XT_OLT_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -1320,7 +1321,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x2, x1); + xtbool2 check = XT_OLE_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -1340,7 +1341,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LASX2IP(x1, inp1_a, p_a); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x2, x1); + xtbool2 check = XT_OLE_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -1374,7 +1375,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x2, x1); + xtbool2 check = XT_OLT_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -1394,7 +1395,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LASX2IP(x1, inp1_a, p_a); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x2, x1); + xtbool2 check = XT_OLT_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -1428,7 +1429,7 @@ static void 
internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); uint8_t val = AE_MOVAB2(check); @@ -1448,7 +1449,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LASX2IP(x1, inp1_a, p_a); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); uint8_t val = AE_MOVAB2(check); @@ -1483,7 +1484,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); ae_int32x2 store = AE_ZERO32(); AE_MOVF32X2(store, ones, check); @@ -1501,7 +1502,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LASX2IP(x1, inp1_a, p_a); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); ae_int32x2 store = AE_ZERO32(); AE_MOVF32X2(store, ones, check); @@ -1537,7 +1538,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x2, x1); + xtbool2 check = XT_OLE_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -1558,7 +1559,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LASX2IP(x1, inp1_a, p_a); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x2, x1); + xtbool2 check = XT_OLE_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -1592,7 +1593,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x2, x1); + xtbool2 check = XT_OLT_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -1613,7 +1614,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LASX2IP(x1, inp1_a, p_a); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x2, x1); + xtbool2 check = XT_OLT_SX2(x2, x1); uint8_t val = AE_MOVAB2(check); @@ -1647,7 +1648,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x1, x2); + xtbool2 check = XT_OLE_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -1668,7 +1669,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LASX2IP(x1, inp1_a, p_a); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LE_xtfloatx2(x1, x2); + xtbool2 check = XT_OLE_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -1702,7 +1703,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); //y = XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x1, x2); + xtbool2 check = XT_OLT_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -1723,7 +1724,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LASX2IP(x1, inp1_a, p_a); //y = 
XT_SUB_SX2(x1, x2); - xtbool2 check = xtfloatx2_LT_xtfloatx2(x1, x2); + xtbool2 check = XT_OLT_SX2(x1, x2); uint8_t val = AE_MOVAB2(check); @@ -1757,7 +1758,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); uint8_t val = AE_MOVAB2(check); @@ -1778,7 +1779,7 @@ static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __r XT_LASX2IP(x1, inp1_a, p_a); //y = XT_SUB_SX2(x2, x1); - xtbool2 check = xtfloatx2_EQ_xtfloatx2(x1, x2); + xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); uint8_t val = AE_MOVAB2(check); From 38b85c06453ec2adfdea73e230aa190bfaa2e60b Mon Sep 17 00:00:00 2001 From: dijopaul Date: Thu, 17 Jul 2025 05:27:29 -0700 Subject: [PATCH 2/4] im2row initial setup --- backends/cadence/hifi/kernels/kernels.h | 1 + 1 file changed, 1 insertion(+) diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h index 2574b9d60ee..44e2efd88cd 100644 --- a/backends/cadence/hifi/kernels/kernels.h +++ b/backends/cadence/hifi/kernels/kernels.h @@ -289,3 +289,4 @@ void dequantize( }; // namespace HiFi }; // namespace impl }; // namespace cadence + From 31f2dbd794017482b5df45b2623cc43ad3ab0549 Mon Sep 17 00:00:00 2001 From: dijopaul Date: Thu, 17 Jul 2025 05:44:11 -0700 Subject: [PATCH 3/4] Adding im2row in hifi flow with optimization --- backends/cadence/aot/functions_hifi.yaml | 5 + backends/cadence/hifi/kernels/CMakeLists.txt | 1 + backends/cadence/hifi/kernels/kernels.h | 23 +- .../cadence/hifi/operators/CMakeLists.txt | 1 + .../cadence/hifi/operators/im2row_out.cpp | 435 ++++++++++++++++++ .../hifi/third-party/nnlib/xa_nn_im2row.c | 133 ++++++ 6 files changed, 597 insertions(+), 1 deletion(-) create mode 100644 backends/cadence/hifi/operators/im2row_out.cpp create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_im2row.c diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml index 944967e3cee..851081957ca 100644 --- a/backends/cadence/aot/functions_hifi.yaml +++ b/backends/cadence/aot/functions_hifi.yaml @@ -314,6 +314,11 @@ - arg_meta: null kernel_name: cadence::impl::HiFi::quantized_linear_per_tensor_out +- func: cadence::im2row.out(Tensor input, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, Tensor in_zero_point, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::im2row_out + - func: cadence::quantized_relu_per_tensor.out(Tensor X, Tensor X_zero_point, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null diff --git a/backends/cadence/hifi/kernels/CMakeLists.txt b/backends/cadence/hifi/kernels/CMakeLists.txt index 972bb4b7ab1..d39ad9769e9 100644 --- a/backends/cadence/hifi/kernels/CMakeLists.txt +++ b/backends/cadence/hifi/kernels/CMakeLists.txt @@ -18,6 +18,7 @@ add_library( ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_fmod_broadcast_f32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_greater_lesser_equal_f32.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_im2row.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_logicalxor_bool_bool.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h index 44e2efd88cd..a3a40cd51fb 100644 --- a/backends/cadence/hifi/kernels/kernels.h +++ b/backends/cadence/hifi/kernels/kernels.h @@ -196,6 +196,28 @@ extern "C" WORD32 xa_nn_elm_where_broadcast_4D_f32xf32_f32( const unsigned char* __restrict__ p_condition, const WORD32* const p_condition_shape); +extern "C" WORD32 xa_nn_im2row_quantized( + const WORD8* __restrict__ data_im, + const WORD32 in_zero_point, + /* input parameters*/ + const WORD32 channels, + const WORD32 height, + const WORD32 width, + /* output parameters */ + const WORD32 out_height, + const WORD32 out_width, + /* convolution parameters */ + const WORD32 kernel_h, + const WORD32 kernel_w, + const WORD32 pad_h, + const WORD32 pad_w, + const WORD32 stride_h, + const WORD32 stride_w, + const WORD32 dilation_h, + const WORD32 dilation_w, + WORD8* __restrict__ data_col, + WORD32 channels_last); + extern "C" WORD32 xa_nn_reduce_mean_4D_f32_f32( FLOAT32* __restrict__ p_out, const WORD32* const p_out_shape, @@ -289,4 +311,3 @@ void dequantize( }; // namespace HiFi }; // namespace impl }; // namespace cadence - diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt index 92432cdc24c..d55e1a303f5 100644 --- a/backends/cadence/hifi/operators/CMakeLists.txt +++ b/backends/cadence/hifi/operators/CMakeLists.txt @@ -16,6 +16,7 @@ include(${EXECUTORCH_ROOT}/tools/cmake/Codegen.cmake) # ATen compliant ops that are needed to run this model. set(_aten_ops__srcs + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/im2row_out.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_add.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_atan2.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_bitwise_and.cpp" diff --git a/backends/cadence/hifi/operators/im2row_out.cpp b/backends/cadence/hifi/operators/im2row_out.cpp new file mode 100644 index 00000000000..e60c747c9e8 --- /dev/null +++ b/backends/cadence/hifi/operators/im2row_out.cpp @@ -0,0 +1,435 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#define ALIGN_PTR(x, bytes) ((((unsigned)(x)) + (bytes - 1)) & (~(bytes - 1)))
+
+using ::executorch::aten::IntArrayRef;
+using ::executorch::aten::ScalarType;
+using ::executorch::aten::Tensor;
+using ::executorch::runtime::KernelRuntimeContext;
+
+namespace cadence {
+namespace impl {
+namespace HiFi {
+namespace native {
+
+template <typename T>
+__attribute__((always_inline)) void im2row_(
+    const T* __restrict__ data_im,
+    const int32_t in_zero_point,
+    /* input parameters*/
+    const int32_t channels,
+    const int32_t height,
+    const int32_t width,
+    /* output parameters */
+    const int32_t out_height,
+    const int32_t out_width,
+    /* convolution parameters */
+    const int32_t kernel_h,
+    const int32_t kernel_w,
+    const int32_t pad_h,
+    const int32_t pad_w,
+    const int32_t stride_h,
+    const int32_t stride_w,
+    const int32_t dilation_h,
+    const int32_t dilation_w,
+    T* __restrict__ data_col,
+    bool channels_last) {
+  // Consider convolving the input image of dimensions channels * height * width
+  // (or height * width * channels for NHWC layout) with a filter of dimensions
+  // channels * kernel_h * kernel_w. Assume that this convolution will produce
+  // an output of dimensions out_height x out_width. For each point in the
+  // output, im2row takes the data from the input that is used in the
+  // computation of that output point, and flattens it into a vector of size
+  // channels_col = channels * kernel_h * kernel_w. The output of im2row will
+  // therefore be a 2D array of size (out_height * out_width) x channels_col.
+  const int32_t channels_col = channels * kernel_h * kernel_w;
+
+  // If the layout is NHWC, we can copy 'channels' worth of contiguous data
+  // points when performing im2row.
+  if (channels_last) {
+    // Iterate over the output domain
+    for (int _h = 0; _h < out_height; ++_h) {
+      for (int _w = 0; _w < out_width; ++_w) {
+        int32_t i_col = _h * out_width + _w;
+        // Each point in the output domain is the result of applying a filter of
+        // size kernel_h x kernel_w x channels on the input. But since channels
+        // is contiguous, we will not explicitly have a loop for it.
+        for (int _kh = 0; _kh < kernel_h; ++_kh) {
+          int32_t h_im = _h * stride_h - pad_h + _kh * dilation_h;
+          for (int _kw = 0; _kw < kernel_w; ++_kw) {
+            int32_t w_im = _w * stride_w - pad_w + _kw * dilation_w;
+
+            // h_im and w_im are the actual height and width coordinates of the
+            // input tensor from where we need to copy 'channels' points.
+            const T* __restrict__ slice_im =
+                data_im + (h_im * width + w_im) * channels;
+            T* __restrict__ slice_col = data_col + i_col * channels_col +
+                (_kh * kernel_w + _kw) * channels;
+            // If the coordinates were within the input domain, we copy
+            // 'channels' contiguous values. Otherwise we will fill the output
+            // with 0's.
+            if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) {
+              std::memcpy(slice_col, slice_im, channels * sizeof(T));
+            } else {
+              std::fill_n(slice_col, channels, T(in_zero_point));
+            }
+          }
+        }
+      }
+    }
+  } else {
+    // Iterate over the output domain
+    for (int _h = 0; _h < out_height; ++_h) {
+      for (int _w = 0; _w < out_width; ++_w) {
+        int32_t i_col = _h * out_width + _w;
+
+        // Each point in the output domain is the result of applying a filter
+        // of size channels * kernel_h x kernel_w on the input
+        for (int _c = 0; _c < channels; ++_c) {
+          for (int _kh = 0; _kh < kernel_h; ++_kh) {
+            for (int _kw = 0; _kw < kernel_w; ++_kw) {
+              // c_col is the linearized access in the channels_col vector.
+              int32_t c_col = (_c * kernel_h + _kh) * kernel_w + _kw;
+              // h_im and w_im are the actual height and width coordinates of
+              // the input tensor that we need to copy to the output.
+              int32_t h_im = _h * stride_h - pad_h + _kh * dilation_h;
+              int32_t w_im = _w * stride_w - pad_w + _kw * dilation_w;
+              // If the current data access is within the input tensor, copy the
+              // value
+              data_col[i_col * channels_col + c_col] =
+                  (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width)
+                  ? data_im[(_c * height + h_im) * width + w_im]
+                  : static_cast<T>(in_zero_point);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+void im2row_out(
+    __ET_UNUSED KernelRuntimeContext& ctx,
+    const Tensor& input,
+    IntArrayRef kernel_size,
+    IntArrayRef dilation,
+    IntArrayRef padding,
+    IntArrayRef stride,
+    const Tensor& in_zero_point,
+    bool channel_last,
+    Tensor& out) {
+  // Compute the input tensor's dims
+  bool unit_height = input.dim() == 3;
+  const int32_t batch_size = input.size(0);
+  const int32_t in_c =
+      channel_last ? input.size(3 - unit_height) : input.size(1);
+  const int32_t in_h =
+      unit_height ? 1 : (channel_last ? input.size(1) : input.size(2));
+  const int32_t in_w =
+      channel_last ? input.size(2 - unit_height) : input.size(3 - unit_height);
+
+  // Get the kernel parameters
+  int32_t kernel_h = kernel_size[0];
+  int32_t kernel_w = kernel_size[1];
+  int32_t dilation_h = dilation[0];
+  int32_t dilation_w = dilation[1];
+  int32_t pad_h = padding[0];
+  int32_t pad_w = padding[1];
+  int32_t stride_h = stride[0];
+  int32_t stride_w = stride[1];
+
+  // If we were to apply a convolution on the input tensor, compute the output
+  // height and width.
+  int32_t out_h =
+      (in_h + 2 * pad_h - dilation_h * (kernel_h - 1) - 1) / stride_h + 1;
+  int32_t out_w =
+      (in_w + 2 * pad_w - dilation_w * (kernel_w - 1) - 1) / stride_w + 1;
+
+  ET_DCHECK_MSG(
+      (out_h * out_w) == out.size(1), "dimension mismatch for output");
+  ET_DCHECK_MSG(
+      (kernel_h * kernel_w * in_c) == out.size(2),
+      "dimension mismatch for output");
+  // Check if the input is per-tensor quantized or per-channel quantized. The
+  // zero point for each batch could differ for per-channel quantized input.
+ bool per_tensor_quantized = in_zero_point.numel() == 1; + + bool optimized = false; + if(input.scalar_type() == ScalarType::Char || input.scalar_type() == ScalarType::Byte) + optimized = true; + + if(!optimized) { + WORD8* ptr1 = (WORD8*)kernels::allocate_temp_memory( + ctx, + ((batch_size * in_c * in_h * in_w) + 8) * + sizeof(WORD8)); + + WORD8* pin = (WORD8*)ALIGN_PTR(ptr1, 8); + + WORD32 p_inp_shape[4]; + p_inp_shape[0] = input.size(0); + p_inp_shape[1] = in_c; + p_inp_shape[2] = in_h; + p_inp_shape[3] = in_w; + + WORD32 p_out_shape[4]; + p_out_shape[0] = input.size(0); + p_out_shape[1] = in_h; + p_out_shape[2] = in_w; + p_out_shape[3] = in_c; + + WORD32 p_permute_vec[4] = {0, 2, 3, 1}; + + WORD8* __restrict__ p_inp = + (WORD8* __restrict__)input.const_data_ptr(); + + xa_nn_transpose_8_8( + pin, + p_out_shape, + p_inp, + p_inp_shape, + p_permute_vec, + 4, // input dimensions + 4); // output dimensions + + const int8_t* __restrict__ in_data = pin; + int8_t* __restrict__ out_data = out.mutable_data_ptr(); + const int32_t* __restrict__ zero_point = + in_zero_point.const_data_ptr(); + int32_t in_plane = in_c * in_h * in_w; + int32_t out_plane = kernel_h * kernel_w * in_c * out_h * out_w; + for (size_t n = 0; n < batch_size; ++n) { + xa_nn_im2row_quantized( + &in_data[n * in_plane], + per_tensor_quantized ? zero_point[0] : zero_point[n], + in_c, + in_h, + in_w, + out_h, + out_w, + kernel_h, + kernel_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + &out_data[n * out_plane], + 1/*channel_last*/); + } + } + else { +#define typed_im2row(dtype, ctype) \ + case ScalarType::dtype: { \ + const ctype* __restrict__ in_data = input.const_data_ptr(); \ + ctype* __restrict__ out_data = out.mutable_data_ptr(); \ + const int32_t* __restrict__ zero_point = \ + in_zero_point.const_data_ptr(); \ + int32_t in_plane = in_c * in_h * in_w; \ + int32_t out_plane = kernel_h * kernel_w * in_c * out_h * out_w; \ + for (size_t n = 0; n < batch_size; ++n) { \ + im2row_( \ + &in_data[n * in_plane], \ + per_tensor_quantized ? zero_point[0] : zero_point[n], \ + in_c, \ + in_h, \ + in_w, \ + out_h, \ + out_w, \ + kernel_h, \ + kernel_w, \ + pad_h, \ + pad_w, \ + stride_h, \ + stride_w, \ + dilation_h, \ + dilation_w, \ + &out_data[n * out_plane], \ + channel_last); \ + } \ + break; \ + } + + ScalarType dtype = input.scalar_type(); + switch (dtype) { + typed_im2row(Float, float); + typed_im2row(Byte, uint8_t); + typed_im2row(Char, int8_t); + default: + ET_DCHECK_MSG( + false, + "im2row not implemented for dtype %s", + torch::executor::toString(dtype)); + } +#undef typed_im2row + } +} + +void im2row_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + IntArrayRef kernel_size, + IntArrayRef dilation, + IntArrayRef padding, + IntArrayRef stride, + int64_t in_zero_point, + bool channel_last, + Tensor& out) { + // Compute the input tensor's dims + bool unit_height = input.dim() == 3; + const int32_t batch_size = input.size(0); + const int32_t in_c = + channel_last ? input.size(3 - unit_height) : input.size(1); + const int32_t in_h = + unit_height ? 1 : (channel_last ? input.size(1) : input.size(2)); + const int32_t in_w = + channel_last ? 
input.size(2 - unit_height) : input.size(3 - unit_height); + + // Get the kernel parameters + int32_t kernel_h = kernel_size[0]; + int32_t kernel_w = kernel_size[1]; + int32_t dilation_h = dilation[0]; + int32_t dilation_w = dilation[1]; + int32_t pad_h = padding[0]; + int32_t pad_w = padding[1]; + int32_t stride_h = stride[0]; + int32_t stride_w = stride[1]; + + // If we were to apply a convolution on the input tensor, compute the output + // height and width. + int32_t out_h = + (in_h + 2 * pad_h - dilation_h * (kernel_h - 1) - 1) / stride_h + 1; + int32_t out_w = + (in_w + 2 * pad_w - dilation_w * (kernel_w - 1) - 1) / stride_w + 1; + + ET_DCHECK_MSG( + (out_h * out_w) == out.size(1), "dimension mismatch for output"); + ET_DCHECK_MSG( + (kernel_h * kernel_w * in_c) == out.size(2), + "dimension mismatch for output"); + + bool optimized = false; + if(input.scalar_type() == ScalarType::Char || input.scalar_type() == ScalarType::Byte) + optimized = true; + + if(!optimized) { + WORD8* ptr1 = (WORD8*)kernels::allocate_temp_memory( + ctx, + ((batch_size * in_c * in_h * in_w) + 8) * + sizeof(WORD8)); + + WORD8* pin = (WORD8*)ALIGN_PTR(ptr1, 8); + + WORD32 p_inp_shape[4]; + p_inp_shape[0] = input.size(0); + p_inp_shape[1] = in_c; + p_inp_shape[2] = in_h; + p_inp_shape[3] = in_w; + + WORD32 p_out_shape[4]; + p_out_shape[0] = input.size(0); + p_out_shape[1] = in_h; + p_out_shape[2] = in_w; + p_out_shape[3] = in_c; + + WORD32 p_permute_vec[4] = {0, 2, 3, 1}; + + WORD8* __restrict__ p_inp = + (WORD8* __restrict__)input.const_data_ptr(); + + xa_nn_transpose_8_8( + pin, + p_out_shape, + p_inp, + p_inp_shape, + p_permute_vec, + 4, // input dimensions + 4); // output dimensions + + const int8_t* __restrict__ in_data = pin; + int8_t* __restrict__ out_data = out.mutable_data_ptr(); + int32_t in_plane = in_c * in_h * in_w; + int32_t out_plane = kernel_h * kernel_w * in_c * out_h * out_w; + for (size_t n = 0; n < batch_size; ++n) { + xa_nn_im2row_quantized( + &in_data[n * in_plane], + (int32_t)in_zero_point, + in_c, + in_h, + in_w, + out_h, + out_w, + kernel_h, + kernel_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + &out_data[n * out_plane], + 1/*channel_last*/); + } + } + else { +#define typed_im2row_per_tensor(dtype, ctype) \ + case ScalarType::dtype: { \ + const ctype* __restrict__ in_data = input.const_data_ptr(); \ + ctype* __restrict__ out_data = out.mutable_data_ptr(); \ + int32_t in_plane = in_c * in_h * in_w; \ + int32_t out_plane = kernel_h * kernel_w * in_c * out_h * out_w; \ + for (size_t n = 0; n < batch_size; ++n) { \ + im2row_( \ + &in_data[n * in_plane], \ + in_zero_point, \ + in_c, \ + in_h, \ + in_w, \ + out_h, \ + out_w, \ + kernel_h, \ + kernel_w, \ + pad_h, \ + pad_w, \ + stride_h, \ + stride_w, \ + dilation_h, \ + dilation_w, \ + &out_data[n * out_plane], \ + channel_last); \ + } \ + break; \ + } + + ScalarType dtype = input.scalar_type(); + switch (dtype) { + typed_im2row_per_tensor(Float, float); + typed_im2row_per_tensor(Byte, uint8_t); + typed_im2row_per_tensor(Char, int8_t); + default: + ET_DCHECK_MSG( + false, + "im2row.per_tensor not implemented for dtype %s", + torch::executor::toString(dtype)); + } +#undef typed_im2row_per_tensor + } +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence \ No newline at end of file diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_im2row.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_im2row.c new file mode 100644 index 00000000000..3746991d430 --- /dev/null +++ 
b/backends/cadence/hifi/third-party/nnlib/xa_nn_im2row.c @@ -0,0 +1,133 @@ +#include "xa_type_def.h" +#include "xa_nnlib_common_fpu.h" +#include "xa_nn_common.h" +#include "xa_nnlib_err_chk.h" +//#include "xa_nn_basic_state.h" +#include "xa_nnlib_kernels_api.h" + +WORD32 xa_nn_im2row_quantized( + const WORD8* __restrict__ data_im, + const WORD32 in_zero_point, + /* input parameters*/ + const WORD32 channels, + const WORD32 height, + const WORD32 width, + /* output parameters */ + const WORD32 out_height, + const WORD32 out_width, + /* convolution parameters */ + const WORD32 kernel_h, + const WORD32 kernel_w, + const WORD32 pad_h, + const WORD32 pad_w, + const WORD32 stride_h, + const WORD32 stride_w, + const WORD32 dilation_h, + const WORD32 dilation_w, + WORD8* __restrict__ data_col, + WORD32 channels_last) +{ + const WORD32 channels_col = channels * kernel_h * kernel_w; + + // If the layout is NHWC, we can copy 'channels' worth of contiguous data + // points when performing im2row. + if (channels_last) { + // Iterate over the output domain + for (int _h = 0; _h < out_height; ++_h) { + for (int _w = 0; _w < out_width; ++_w) { + int32_t i_col = _h * out_width + _w; + // Each point in the output domain is the result of applying a filter of + // size kernel_h x kernel_w x channels on the input. But since channels + // is contiguous, we will not explicitly have a loop for it. + for (int _kh = 0; _kh < kernel_h; ++_kh) { + int32_t h_im = _h * stride_h - pad_h + _kh * dilation_h; + for (int _kw = 0; _kw < kernel_w; ++_kw) { + int32_t w_im = _w * stride_w - pad_w + _kw * dilation_w; + + // h_im and w_im are the actual height and width coordinates of the + // input tensor from where we need to copy 'channels' points. + const int8_t* __restrict__ slice_im = + data_im + (h_im * width + w_im) * channels; + int8_t* __restrict__ slice_col = data_col + i_col * channels_col + + (_kh * kernel_w + _kw) * channels; + // If the coordinates were within the input domain, we copy + // 'channels' contiguous values. Otherwise we will fill the output + // with 0's. 
+            if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) {
+              const ae_int24x2 *pae_inp = (const ae_int24x2 *)slice_im;
+              ae_int24x2 *pae_out = (ae_int24x2 *)slice_col;
+              ae_valign inp_a, out_a;
+              inp_a = AE_LA64_PP(pae_inp);
+              out_a = AE_ZALIGN64();
+
+              int ic;
+              for (ic = 0; ic + 5 < channels; ic += 6)
+              {
+                ae_int24x2 d0;
+                AE_LA24X2_IP(d0, inp_a, pae_inp);
+                AE_SA24X2_IP(d0, out_a, pae_out);
+              }
+              AE_SA64POS_FP(out_a, pae_out);
+              for (int i = ic; i < channels; i++)
+              {
+                slice_col[i] = slice_im[i];
+              }
+            }
+            else {
+              ae_int24x2 *pae_out = (ae_int24x2 *)slice_col;
+              ae_valign out_a;
+              out_a = AE_ZALIGN64();
+
+              ae_int32x2 tmp = AE_MOVDA32(in_zero_point);
+              ae_int32x2 in_zero_point32x2 = AE_SLLI32(tmp, 8);
+              in_zero_point32x2 = AE_OR32(in_zero_point32x2, tmp);
+              in_zero_point32x2 = AE_SLLI32(in_zero_point32x2, 8);
+              in_zero_point32x2 = AE_OR32(in_zero_point32x2, tmp);
+
+              ae_int24x2 d0 = AE_MOVINT24X2_FROMINT32X2(in_zero_point32x2);
+              int ic;
+              for (ic = 0; ic + 5 < channels; ic += 6)
+              {
+                AE_SA24X2_IP(d0, out_a, pae_out);
+              }
+              AE_SA64POS_FP(out_a, pae_out);
+              for (int i = ic; i < channels; i++)
+              {
+                slice_col[i] = (int8_t)(in_zero_point);
+              }
+            }
+          }
+        }
+      }
+    }
+  } else {
+    // Iterate over the output domain
+    for (int _h = 0; _h < out_height; ++_h) {
+      for (int _w = 0; _w < out_width; ++_w) {
+        int32_t i_col = _h * out_width + _w;
+
+        // Each point in the output domain is the result of applying a filter
+        // of size channels * kernel_h x kernel_w on the input
+        for (int _c = 0; _c < channels; ++_c) {
+          for (int _kh = 0; _kh < kernel_h; ++_kh) {
+            for (int _kw = 0; _kw < kernel_w; ++_kw) {
+              // c_col is the linearized access in the channels_col vector.
+              int32_t c_col = (_c * kernel_h + _kh) * kernel_w + _kw;
+              // h_im and w_im are the actual height and width coordinates of
+              // the input tensor that we need to copy to the output.
+              int32_t h_im = _h * stride_h - pad_h + _kh * dilation_h;
+              int32_t w_im = _w * stride_w - pad_w + _kw * dilation_w;
+              // If the current data access is within the input tensor, copy the
+              // value
+              data_col[i_col * channels_col + c_col] =
+                  (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width)
+                  ?
data_im[(_c * height + h_im) * width + w_im] + : (int8_t)(in_zero_point); + } + } + } + } + } + } + return 0; +} From 589e14b6ea7e34cdfb025d4440df72bb230c0526 Mon Sep 17 00:00:00 2001 From: dijopaul Date: Thu, 17 Jul 2025 09:26:19 -0700 Subject: [PATCH 4/4] Fixing lint error --- .../cadence/hifi/operators/im2row_out.cpp | 202 +++++++++--------- 1 file changed, 99 insertions(+), 103 deletions(-) diff --git a/backends/cadence/hifi/operators/im2row_out.cpp b/backends/cadence/hifi/operators/im2row_out.cpp index e60c747c9e8..4793a36fec4 100644 --- a/backends/cadence/hifi/operators/im2row_out.cpp +++ b/backends/cadence/hifi/operators/im2row_out.cpp @@ -167,16 +167,15 @@ void im2row_out( bool per_tensor_quantized = in_zero_point.numel() == 1; bool optimized = false; - if(input.scalar_type() == ScalarType::Char || input.scalar_type() == ScalarType::Byte) + if (input.scalar_type() == ScalarType::Char || + input.scalar_type() == ScalarType::Byte) optimized = true; - if(!optimized) { + if (!optimized) { WORD8* ptr1 = (WORD8*)kernels::allocate_temp_memory( - ctx, - ((batch_size * in_c * in_h * in_w) + 8) * - sizeof(WORD8)); + ctx, ((batch_size * in_c * in_h * in_w) + 8) * sizeof(WORD8)); - WORD8* pin = (WORD8*)ALIGN_PTR(ptr1, 8); + WORD8* pin = (WORD8*)ALIGN_PTR(ptr1, 8); WORD32 p_inp_shape[4]; p_inp_shape[0] = input.size(0); @@ -189,49 +188,48 @@ void im2row_out( p_out_shape[1] = in_h; p_out_shape[2] = in_w; p_out_shape[3] = in_c; - + WORD32 p_permute_vec[4] = {0, 2, 3, 1}; WORD8* __restrict__ p_inp = - (WORD8* __restrict__)input.const_data_ptr(); + (WORD8* __restrict__)input.const_data_ptr(); xa_nn_transpose_8_8( - pin, - p_out_shape, - p_inp, - p_inp_shape, - p_permute_vec, - 4, // input dimensions - 4); // output dimensions + pin, + p_out_shape, + p_inp, + p_inp_shape, + p_permute_vec, + 4, // input dimensions + 4); // output dimensions const int8_t* __restrict__ in_data = pin; - int8_t* __restrict__ out_data = out.mutable_data_ptr(); - const int32_t* __restrict__ zero_point = - in_zero_point.const_data_ptr(); - int32_t in_plane = in_c * in_h * in_w; - int32_t out_plane = kernel_h * kernel_w * in_c * out_h * out_w; - for (size_t n = 0; n < batch_size; ++n) { - xa_nn_im2row_quantized( - &in_data[n * in_plane], - per_tensor_quantized ? zero_point[0] : zero_point[n], - in_c, - in_h, - in_w, - out_h, - out_w, - kernel_h, - kernel_w, - pad_h, - pad_w, - stride_h, - stride_w, - dilation_h, - dilation_w, - &out_data[n * out_plane], - 1/*channel_last*/); + int8_t* __restrict__ out_data = out.mutable_data_ptr(); + const int32_t* __restrict__ zero_point = + in_zero_point.const_data_ptr(); + int32_t in_plane = in_c * in_h * in_w; + int32_t out_plane = kernel_h * kernel_w * in_c * out_h * out_w; + for (size_t n = 0; n < batch_size; ++n) { + xa_nn_im2row_quantized( + &in_data[n * in_plane], + per_tensor_quantized ? 
zero_point[0] : zero_point[n], + in_c, + in_h, + in_w, + out_h, + out_w, + kernel_h, + kernel_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + &out_data[n * out_plane], + 1 /*channel_last*/); } - } - else { + } else { #define typed_im2row(dtype, ctype) \ case ScalarType::dtype: { \ const ctype* __restrict__ in_data = input.const_data_ptr(); \ @@ -263,17 +261,17 @@ void im2row_out( break; \ } - ScalarType dtype = input.scalar_type(); - switch (dtype) { - typed_im2row(Float, float); - typed_im2row(Byte, uint8_t); - typed_im2row(Char, int8_t); - default: - ET_DCHECK_MSG( - false, - "im2row not implemented for dtype %s", - torch::executor::toString(dtype)); - } + ScalarType dtype = input.scalar_type(); + switch (dtype) { + typed_im2row(Float, float); + typed_im2row(Byte, uint8_t); + typed_im2row(Char, int8_t); + default: + ET_DCHECK_MSG( + false, + "im2row not implemented for dtype %s", + torch::executor::toString(dtype)); + } #undef typed_im2row } } @@ -320,18 +318,17 @@ void im2row_per_tensor_out( ET_DCHECK_MSG( (kernel_h * kernel_w * in_c) == out.size(2), "dimension mismatch for output"); - + bool optimized = false; - if(input.scalar_type() == ScalarType::Char || input.scalar_type() == ScalarType::Byte) + if (input.scalar_type() == ScalarType::Char || + input.scalar_type() == ScalarType::Byte) optimized = true; - if(!optimized) { + if (!optimized) { WORD8* ptr1 = (WORD8*)kernels::allocate_temp_memory( - ctx, - ((batch_size * in_c * in_h * in_w) + 8) * - sizeof(WORD8)); + ctx, ((batch_size * in_c * in_h * in_w) + 8) * sizeof(WORD8)); - WORD8* pin = (WORD8*)ALIGN_PTR(ptr1, 8); + WORD8* pin = (WORD8*)ALIGN_PTR(ptr1, 8); WORD32 p_inp_shape[4]; p_inp_shape[0] = input.size(0); @@ -344,47 +341,46 @@ void im2row_per_tensor_out( p_out_shape[1] = in_h; p_out_shape[2] = in_w; p_out_shape[3] = in_c; - + WORD32 p_permute_vec[4] = {0, 2, 3, 1}; WORD8* __restrict__ p_inp = - (WORD8* __restrict__)input.const_data_ptr(); + (WORD8* __restrict__)input.const_data_ptr(); xa_nn_transpose_8_8( - pin, - p_out_shape, - p_inp, - p_inp_shape, - p_permute_vec, - 4, // input dimensions - 4); // output dimensions + pin, + p_out_shape, + p_inp, + p_inp_shape, + p_permute_vec, + 4, // input dimensions + 4); // output dimensions const int8_t* __restrict__ in_data = pin; - int8_t* __restrict__ out_data = out.mutable_data_ptr(); - int32_t in_plane = in_c * in_h * in_w; - int32_t out_plane = kernel_h * kernel_w * in_c * out_h * out_w; - for (size_t n = 0; n < batch_size; ++n) { - xa_nn_im2row_quantized( - &in_data[n * in_plane], - (int32_t)in_zero_point, - in_c, - in_h, - in_w, - out_h, - out_w, - kernel_h, - kernel_w, - pad_h, - pad_w, - stride_h, - stride_w, - dilation_h, - dilation_w, - &out_data[n * out_plane], - 1/*channel_last*/); + int8_t* __restrict__ out_data = out.mutable_data_ptr(); + int32_t in_plane = in_c * in_h * in_w; + int32_t out_plane = kernel_h * kernel_w * in_c * out_h * out_w; + for (size_t n = 0; n < batch_size; ++n) { + xa_nn_im2row_quantized( + &in_data[n * in_plane], + (int32_t)in_zero_point, + in_c, + in_h, + in_w, + out_h, + out_w, + kernel_h, + kernel_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + &out_data[n * out_plane], + 1 /*channel_last*/); } - } - else { + } else { #define typed_im2row_per_tensor(dtype, ctype) \ case ScalarType::dtype: { \ const ctype* __restrict__ in_data = input.const_data_ptr(); \ @@ -414,17 +410,17 @@ void im2row_per_tensor_out( break; \ } - ScalarType dtype = input.scalar_type(); - switch (dtype) { - 
typed_im2row_per_tensor(Float, float); - typed_im2row_per_tensor(Byte, uint8_t); - typed_im2row_per_tensor(Char, int8_t); - default: - ET_DCHECK_MSG( - false, - "im2row.per_tensor not implemented for dtype %s", - torch::executor::toString(dtype)); - } + ScalarType dtype = input.scalar_type(); + switch (dtype) { + typed_im2row_per_tensor(Float, float); + typed_im2row_per_tensor(Byte, uint8_t); + typed_im2row_per_tensor(Char, int8_t); + default: + ET_DCHECK_MSG( + false, + "im2row.per_tensor not implemented for dtype %s", + torch::executor::toString(dtype)); + } #undef typed_im2row_per_tensor } }
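
For reference, the NCHW indexing that both the `im2row_` template in `im2row_out.cpp` and the portable branch of `xa_nn_im2row_quantized` implement can be exercised on a host with the small standalone sketch below. It is not part of the patches; the function name `im2row_nchw_ref` and the sizes used in `main` are hypothetical, chosen only to illustrate the (out_h * out_w) x (channels * kernel_h * kernel_w) output layout described in the patch comments.

```c
#include <stdint.h>
#include <stdio.h>

/* Plain-C restatement of the NCHW im2row indexing used in the patches:
 * the output is (out_h * out_w) rows by (channels * kernel_h * kernel_w)
 * columns, with out-of-bounds taps filled with the zero point. */
static void im2row_nchw_ref(const int8_t *im, int8_t zero_point,
                            int channels, int height, int width,
                            int out_h, int out_w,
                            int kernel_h, int kernel_w,
                            int pad_h, int pad_w,
                            int stride_h, int stride_w,
                            int dilation_h, int dilation_w,
                            int8_t *col) {
  const int channels_col = channels * kernel_h * kernel_w;
  for (int oh = 0; oh < out_h; ++oh) {
    for (int ow = 0; ow < out_w; ++ow) {
      const int i_col = oh * out_w + ow;
      for (int c = 0; c < channels; ++c) {
        for (int kh = 0; kh < kernel_h; ++kh) {
          for (int kw = 0; kw < kernel_w; ++kw) {
            const int c_col = (c * kernel_h + kh) * kernel_w + kw;
            const int h_im = oh * stride_h - pad_h + kh * dilation_h;
            const int w_im = ow * stride_w - pad_w + kw * dilation_w;
            col[i_col * channels_col + c_col] =
                (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width)
                    ? im[(c * height + h_im) * width + w_im]
                    : zero_point;
          }
        }
      }
    }
  }
}

int main(void) {
  /* Example sizes (made up): 2x5x5 input, 3x3 kernel, stride 1, pad 1. */
  enum { C = 2, H = 5, W = 5, KH = 3, KW = 3 };
  const int out_h = (H + 2 * 1 - 1 * (KH - 1) - 1) / 1 + 1; /* = 5 */
  const int out_w = (W + 2 * 1 - 1 * (KW - 1) - 1) / 1 + 1; /* = 5 */
  int8_t im[C * H * W];
  int8_t col[5 * 5 * C * KH * KW]; /* out_h * out_w = 5 * 5 for these sizes */
  for (int i = 0; i < C * H * W; ++i) im[i] = (int8_t)i;
  im2row_nchw_ref(im, 0, C, H, W, out_h, out_w, KH, KW,
                  1, 1, 1, 1, 1, 1, col);
  printf("col is %d x %d\n", out_h * out_w, C * KH * KW); /* 25 x 18 */
  return 0;
}
```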