Adding unit tests for collapse with remap #255

Merged · 1 commit · Aug 25, 2022
4 changes: 2 additions & 2 deletions bench/00_transform/conv.cu
@@ -4,7 +4,7 @@
using namespace matx;

using conv_types =
-    nvbench::type_list<cuda::std::complex<float>, cuda::std::complex<double>>;
+    nvbench::type_list<cuda::std::complex<float>, cuda::std::complex<double>, float, double>;

/* FFT benchmarks */
template <typename ValueType>
@@ -44,4 +44,4 @@ void conv1d_2d_batch(nvbench::state &state,
state.exec(
[&out, &at, &bt](nvbench::launch &launch) { conv1d(out, at, bt, MATX_C_MODE_FULL, launch.get_stream()); });
}
-NVBENCH_BENCH_TYPES(conv1d_2d_batch, NVBENCH_TYPE_AXES(conv_types));
\ No newline at end of file
+NVBENCH_BENCH_TYPES(conv1d_2d_batch, NVBENCH_TYPE_AXES(conv_types));
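With the widened conv_types axis, nvbench instantiates each convolution benchmark for the two real types as well as the complex ones. As a reference for readers unfamiliar with nvbench type axes, here is a minimal standalone sketch (hypothetical benchmark name, assuming only the nvbench header) of how one definition expands into one benchmark per listed type:

#include <nvbench/nvbench.cuh>

// One benchmark instance is generated per entry in the axis, so this single
// definition produces axis_demo<float> and axis_demo<double> runs.
using demo_types = nvbench::type_list<float, double>;

template <typename ValueType>
void axis_demo(nvbench::state &state, nvbench::type_list<ValueType>)
{
  state.exec([](nvbench::launch &launch) {
    // launch a ValueType kernel on launch.get_stream() here
  });
}
NVBENCH_BENCH_TYPES(axis_demo, NVBENCH_TYPE_AXES(demo_types));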
19 changes: 14 additions & 5 deletions include/matx/kernels/conv.cuh
@@ -44,7 +44,7 @@ __global__ void Conv1D(OutType d_out, InType d_in, FilterType d_filter,
using outtype_strip = typename OutType::scalar_type;
int chunk_idx = blockIdx.y;
int batch_idx = blockIdx.x;
-  index_t filter_len = d_filter.Size(Rank-1);
+  int32_t filter_len = d_filter.Size(Rank-1);

// All but the last dim will be populated
auto bdims = BlockToIdx(d_in, batch_idx, 1);
@@ -62,13 +62,13 @@ __global__ void Conv1D(OutType d_out, InType d_in, FilterType d_filter,
if constexpr (std::alignment_of_v < intype_strip >>
std::alignment_of_v<ftype_strip>) {
s_data =
-        matx::detail::AlignAddr<intype_strip>((uint8_t *)&s_exch[static_cast<index_t>(
+        matx::detail::AlignAddr<intype_strip>((uint8_t *)&s_exch[static_cast<int32_t>(
filter_len * filt_size_adj)]); // Start data portion after 2x the
// filter to remove conditionals and
// multiply by 0
}
else {
-    s_data = reinterpret_cast<intype_strip *>(&s_exch[static_cast<index_t>(
+    s_data = reinterpret_cast<intype_strip *>(&s_exch[static_cast<int32_t>(
filter_len *
filt_size_adj)]); // Start data portion after 2x the filter to
// remove conditionals and multiply by 0
@@ -80,7 +80,7 @@ __global__ void Conv1D(OutType d_out, InType d_in, FilterType d_filter,
// duplicate tids based on this formula, but not all threads write out to
// memory. Some are only there to fetch data, while others both fetch and
// compute output
-  const index_t tid =
+  const int32_t tid =
static_cast<index_t>(chunk_idx) * (blockDim.x - filter_len + 1) +
threadIdx.x;
int offset = tid - filter_len + 1;
@@ -89,7 +89,7 @@ __global__ void Conv1D(OutType d_out, InType d_in, FilterType d_filter,

// Zero out shared memory since it's used later to index into where we want
// 0-valued taps
-  for (index_t i = threadIdx.x; i < filter_len + blockDim.x; i += blockDim.x) {
+  for (int32_t i = threadIdx.x; i < filter_len + blockDim.x; i += blockDim.x) {
s_data[i] = 0.0;
}

@@ -135,10 +135,19 @@ __global__ void Conv1D(OutType d_out, InType d_in, FilterType d_filter,
// data in shared memory for blockDim-filt_len+1 to operate on. The rest sit
// idle through this process.
if (tid < full_len && (threadIdx.x < blockDim.x - filter_len + 1)) {
#if 0
#pragma unroll
for (index_t r = 0; r < filter_len; r++) {
val = val + s_filter[r] * s_data[threadIdx.x + filter_len - 1 - r];
}
#else
s_data += threadIdx.x + filter_len - 1;
for (int32_t r = 0; r < filter_len; r++) {
val = val + s_filter[0] * s_data[0];
s_data--;
s_filter++;
}
#endif

if (mode == MATX_C_MODE_FULL) {
bdims[Rank - 1] = tid;
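The #else branch above is the substantive kernel change: the per-tap indexing s_data[threadIdx.x + filter_len - 1 - r] is replaced by two walking pointers, and the loop counters narrow from 64-bit index_t to int32_t, both of which cheapen the inner-loop address arithmetic. A self-contained sketch of the pointer-walking tap loop (illustrative only, not the MatX kernel itself):

#include <cstdint>

// Dot product of the filter taps with one thread's window of the signal held
// in shared memory: s_data walks backward from the newest sample while
// s_filter walks forward, so no offset is recomputed per tap.
__device__ float dot_taps(const float *s_filter, const float *s_data,
                          int32_t filter_len, int32_t tidx)
{
  float val = 0.0f;
  s_data += tidx + filter_len - 1; // last sample of this thread's window
  for (int32_t r = 0; r < filter_len; r++) {
    val += s_filter[0] * s_data[0];
    s_data--;
    s_filter++;
  }
  return val;
}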
56 changes: 53 additions & 3 deletions include/matx/operators/collapse.h
@@ -50,7 +50,7 @@ namespace matx
using matxop = bool;
using scalar_type = typename T1::scalar_type;
using shape_type = typename T1::shape_type;
-      using matxlvalue = bool;
+      using matxoplvalue = bool;

__MATX_INLINE__ LCollapseOp(const T1 &op) : op_(op)
{
@@ -61,7 +61,7 @@
        // compute size of collapsed dimension
size_ = 1;

-        // Collapse right-most dims
+        // Collapse left-most dims
#pragma unroll
for(int i = 0 ; i <= DIM; i++) {
size_ *= op_.Size(i);
@@ -85,7 +85,32 @@
auto ind = in[0];
#pragma unroll
for(int i = 0; i <= DIM; i++) {
-          index_t d = DIM - i;
+          int d = DIM - i;
out[d] = ind % op_.Size(d);
ind /= op_.Size(d);
}

return mapply(op_, out);
}

template <typename... Is>
__MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ auto& operator()(Is... indices)
{
// indices coming in
std::array<index_t, Rank()> in{indices...}; // index coming in
std::array<index_t, T1::Rank()> out; // index going out

#pragma unroll
for(int i = 1; i < Rank(); i++) {
// copy all but first input index into out array
out[DIM+i] = in[i];
}

// expand first input index into DIM indices
auto ind = in[0];
#pragma unroll
for(int i = 0; i <= DIM; i++) {
int d = DIM - i;
out[d] = ind % op_.Size(d);
ind /= op_.Size(d);
}
@@ -168,6 +193,31 @@ namespace matx
std::array<index_t, Rank()> in{indices...}; // index coming in
std::array<index_t, T1::Rank()> out; // index going out

#pragma unroll
for(int i = 0 ; i < Rank() - 1; i++) {
// copy all but last index into out array
out[i] = in[i];
}

// expand last index into DIM indices
auto ind = in[Rank() - 1];
#pragma unroll
for(int i = 0; i <= DIM; i++) {
index_t d = T1::Rank() - 1 - i;
out[d] = ind % op_.Size(d);
ind /= op_.Size(d);
}

return mapply(op_, out);
}

template <typename... Is>
__MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ auto& operator()(Is... indices)
{
// indices coming in
std::array<index_t, Rank()> in{indices...}; // index coming in
std::array<index_t, T1::Rank()> out; // index going out

#pragma unroll
for(int i = 0 ; i < Rank() - 1; i++) {
// copy all but last index into out array
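The non-const operator() overloads added above (together with the corrected matxoplvalue typedef) are what make collapsed views assignable, so expressions like lcollapse<2>(...) can appear on the left-hand side of = and as conv1d outputs in the tests below. Both overloads share the same index expansion, sketched here on the host (illustrative, not MatX internals): the single collapsed index is peeled back into per-dimension indices by mod/div over the collapsed sizes, innermost dimension first.

#include <array>
#include <cstdint>

// Expand a left-collapsed index into indices for dims 0..DIM, mirroring the
// loop in LCollapseOp; the caller fills the remaining (uncollapsed) dims.
template <int DIM, std::size_t R>
std::array<std::int64_t, R> expand_left(std::int64_t ind,
                                        const std::array<std::int64_t, R> &sizes)
{
  std::array<std::int64_t, R> out{};
  for (int i = 0; i <= DIM; i++) {
    int d = DIM - i;
    out[d] = ind % sizes[d];
    ind /= sizes[d];
  }
  return out;
}

For example, with sizes {4, 2, 14, 133} and DIM = 2, the collapsed index 37 expands as 37 % 14 = 9, then 2 % 2 = 0, then 1 % 4 = 1, giving out = {1, 0, 9, unset}.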
2 changes: 1 addition & 1 deletion include/matx/transforms/conv.h
@@ -166,7 +166,7 @@ inline void conv1d_impl(OutputType &o, const In1Type &i1, const In2Type &i2,
* @param stream CUDA stream
*/
template <typename OutputType, typename In1Type, typename In2Type>
-inline void conv1d(OutputType &&o, const In1Type &i1, const In2Type &i2,
+inline void conv1d(OutputType o, const In1Type &i1, const In2Type &i2,
matxConvCorrMode_t mode, cudaStream_t stream) {
if constexpr ( In1Type::Rank() > In2Type::Rank() ) {
// broadcast i2 path. clone i2 across batches
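Taking the output by value rather than by rvalue reference relies on MatX operators and tensor views being shallow, cheaply copyable handles: the same overload now accepts both named views and temporaries, and conv1d holds its own copy of the output view for the duration of the call. A usage sketch under that assumption (tensor and operator names hypothetical):

auto out = lcollapse<2>(remap<1>(O, idx)); // a named operator view works...
conv1d(out, a, b, MATX_C_MODE_FULL, 0);
// ...and so does passing the temporary expression directly:
conv1d(lcollapse<2>(remap<1>(O, idx)), a, b, MATX_C_MODE_FULL, 0);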
171 changes: 167 additions & 4 deletions test/00_operators/OperatorTests.cu
@@ -614,7 +614,7 @@ TYPED_TEST(OperatorTestsNumericNonComplex, CollapseOp)
for(int n = 0; n < N; n++) {
for(int m = 0; m < M; m++) {
for(int k = 0; k < K; k++) {
-          EXPECT_TRUE(tiv(n,m,k) == tov(n,m*K+k));
+          ASSERT_TRUE(tiv(n,m,k) == tov(n,m*K+k));
}
}
}
@@ -637,7 +637,7 @@ TYPED_TEST(OperatorTestsNumericNonComplex, CollapseOp)
for(int n = 0; n < N; n++) {
for(int m = 0; m < M; m++) {
for(int k = 0; k < K; k++) {
-          EXPECT_TRUE(tiv(n,m,k) == tov(n*M+m,k));
+          ASSERT_TRUE(tiv(n,m,k) == tov(n*M+m,k));
}
}
}
@@ -658,7 +658,7 @@ TYPED_TEST(OperatorTestsNumericNonComplex, CollapseOp)
for(int n = 0; n < N; n++) {
for(int m = 0; m < M; m++) {
for(int k = 0; k < K; k++) {
-          EXPECT_TRUE(tiv(n,m,k) == tov(n*M*K+m*K+k));
+          ASSERT_TRUE(tiv(n,m,k) == tov(n*M*K+m*K+k));
}
}
}
@@ -679,7 +679,7 @@ TYPED_TEST(OperatorTestsNumericNonComplex, CollapseOp)
for(int n = 0; n < N; n++) {
for(int m = 0; m < M; m++) {
for(int k = 0; k < K; k++) {
-          EXPECT_TRUE(tiv(n,m,k) == tov(n*M*K+m*K+k));
+          ASSERT_TRUE(tiv(n,m,k) == tov(n*M*K+m*K+k));
}
}
}
@@ -2618,6 +2618,169 @@ TEST(OperatorTests, Cast)
MATX_EXIT_HANDLER();
}

TEST(OperatorTestsAdvanced, AdvancedRemapOp)
{
typedef cuda::std::complex<float> complex;
MATX_ENTER_HANDLER();

int I = 4;
int J = 4;
int K = 14;
int L = 133;

int F = 4096;
int P = 288;

int M = 2;

auto idx = matx::make_tensor<int, 1>({M});

idx(0) = 1;
idx(1) = 3;

auto A = matx::make_tensor<complex, 4>({I, J, K, L});
//collapsed tensor
auto B = matx::make_tensor<complex, 2>({I * M * K, L});

auto index = [&] (int i, int j, int k, int l) {
return i * J * K * L +
j * K * L +
k * L +
l;
};
for (int i = 0; i < I ; i++) {
for (int j = 0; j < J ; j++) {
for (int k = 0; k < K ; k++) {
for (int l = 0; l < L ; l++) {
float val = (float)index(i,j,k,l);
A(i,j,k,l) = complex(val, val/100);
}
}
}
}

(B = 0).run();

auto rop = remap<1>(A, idx);
auto lop = lcollapse<2>(rop);
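  // Shape check (see the assertions just below): remap<1>(A, idx) gathers
  // planes 1 and 3 along dim 1, giving (I, M, K, L); lcollapse<2> then folds
  // dims 0..2 into one, giving the rank-2 view (I*M*K, L).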

ASSERT_EQ(lop.Rank() , 2);
ASSERT_EQ(lop.Size(1) , A.Size(3));
ASSERT_EQ(lop.Size(0) , I * M * K);

(B = lop).run();

cudaDeviceSynchronize();

for (int i = 0; i < I; i++) {
for (int m = 0; m < M; m++) {
for (int k = 0; k < K; k++) {
for (int l = 0; l < L; l++) {
int j = idx(m);
int fidx = i * M * K + m * K + k;
float val = (float)index(i,j,k,l);
complex expected_val = complex(val,val/100);
complex a_val = A(i,j,k,l);
complex b_val = B(fidx, l);
complex lop_val = lop(fidx, l);
complex rop_val = rop(i, m, k, l);

// printf("fidx: %d, i: %d, j: %d, k: %d, l: %d, val: %f,%f\n", fidx, i, j, k, l, val, val/100);
// printf("a_val: %f, %f, rop_val: %f, %f, lop_val: %f, %f, b_val: %f, %f\n",
// a_val.real(), a_val.imag(),
// rop_val.real(), rop_val.imag(),
// lop_val.real(), lop_val.imag(),
// b_val.real(), b_val.imag());
ASSERT_EQ(a_val, expected_val);
ASSERT_EQ(rop_val, expected_val);
ASSERT_EQ(lop_val, expected_val);
ASSERT_EQ(b_val, expected_val);

ASSERT_EQ(B(fidx, l) , lop(fidx, l));
}
}
}
}


// convolution test
auto O1 = matx::make_tensor<complex, 4>({I, J, K, F + P + L - 1});
auto O2 = matx::make_tensor<complex, 4>({I, J, K, F + P + L - 1});
auto O3 = matx::make_tensor<complex, 4>({I, J, K, F + P + L - 1});
auto O4 = matx::make_tensor<complex, 4>({I, J, K, F + P + L - 1});

auto C = matx::make_tensor<complex, 3>({I, K, F + P});
//collapsed tensor
auto D = matx::make_tensor<complex, 2>({I * M * K, F + P});

auto indexc = [&] (int i, int j, int k) {
return i * C.Size(1) * C.Size(2) +
j * C.Size(2) +
k;
};

for (int i = 0; i < I ; i++) {
for (int j = 0; j < J ; j++) {
for (int k = 0; k < K ; k++) {
float val = (float) indexc(i,j,k);
C(i,j,k) = complex(val, val/100);
}
}
}

A.PrefetchDevice(0);
B.PrefetchDevice(0);
C.PrefetchDevice(0);
D.PrefetchDevice(0);
O1.PrefetchDevice(0);
O2.PrefetchDevice(0);
O3.PrefetchDevice(0);
O4.PrefetchDevice(0);

cudaDeviceSynchronize();

auto o1op = lcollapse<2>(remap<1>(O1, idx));
auto o2op = lcollapse<2>(remap<1>(O2, idx));
auto o3op = lcollapse<2>(remap<1>(O3, idx));
auto o4op = lcollapse<2>(remap<1>(O4, idx));

auto cop = C.Clone<4>({matxKeepDim, M, matxKeepDim, matxKeepDim});
auto rcop = lcollapse<2>(remap<1>(cop, idx));
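  // C.Clone<4>(...) broadcasts C from (I, K, F+P) to (I, M, K, F+P) by adding
  // a size-M dim; remap<1> + lcollapse<2> then yield the (I*M*K, F+P) view
  // rcop, matching D's shape.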

(O1 = 1).run();
(O2 = 2).run();
(O3 = 3).run();
(O4 = 4).run();

(B = lop).run();
(D = rcop).run();

// two operators as input
matx::conv1d(o1op, lop, rcop, matx::matxConvCorrMode_t::MATX_C_MODE_FULL, 0);

  // one tensor and one operator as input
matx::conv1d(o2op, B, rcop, matx::matxConvCorrMode_t::MATX_C_MODE_FULL, 0);

  // one tensor and one operator as input
matx::conv1d(o3op, lop, D, matx::matxConvCorrMode_t::MATX_C_MODE_FULL, 0);

//two tensors as input
matx::conv1d(o4op, B, D, matx::matxConvCorrMode_t::MATX_C_MODE_FULL, 0);

cudaDeviceSynchronize();

for (int i = 0; i < o1op.Size(0); i++) {
for (int l = 0; l < o1op.Size(1); l++) {
ASSERT_EQ(o1op(i,l), o2op(i,l));
ASSERT_EQ(o2op(i,l), o3op(i,l));
ASSERT_EQ(o3op(i,l), o4op(i,l));
}
}

MATX_EXIT_HANDLER();
}


TYPED_TEST(OperatorTestsFloat, Print)
{
MATX_ENTER_HANDLER();