Add multm_prev_ layer and enhance gemm() function for PLANE_WISE operations #3020

Merged

merged 36 commits on Dec 20, 2024
Changes from 18 commits

Commits (36)
4d698fc
Fix Stride Indexing Bugs in `reorg` and `reorg_gradient` Functions (C…
Cydral Sep 16, 2024
1d73b6c
'add_to' parameter missing in cuda call reorg_gradient.launch_kernel()
Cydral Sep 20, 2024
c343779
Cleanup: remove using namespace std; (#3016)
arrufat Sep 23, 2024
724ec09
Merge branch 'refs/heads/master' into Cydral-master
davisking Sep 23, 2024
4dca9b2
fix build error
davisking Sep 23, 2024
2f68a11
Adjust comment formatting to be like other dlib comments
davisking Sep 23, 2024
64e3471
Merge branch 'davisking:master' into master
Cydral Sep 23, 2024
640c02f
Add positional encodings layer to Dlib
Cydral Sep 24, 2024
0f1e250
Add multm_prev layer and enhance gemm() function for PLANE_WISE opera…
Cydral Sep 26, 2024
e8e10ce
Updates
Cydral Sep 26, 2024
06a7f6a
Updates
Cydral Sep 26, 2024
d40171d
Merge branch 'master' into multm-prev-layer
Cydral Sep 30, 2024
0d60627
Resynchronization with tril_ class
Cydral Sep 30, 2024
ed39b2c
Delete .vscode/settings.json
Cydral Oct 6, 2024
8e2a48c
Merge branch 'master' into multm-prev-layer
Cydral Nov 4, 2024
300a8c6
Remove duplicates
Cydral Nov 4, 2024
d173fbd
Small improvements to PLANE_WISE in gemm() function
Cydral Nov 8, 2024
c81efb7
Same improvements for the CPU version
Cydral Nov 11, 2024
89746e2
Merge branch 'davisking:master' into multm-prev-layer
Cydral Nov 18, 2024
3d60227
Introducing a new enum for operation modes in tensor computations
Cydral Nov 18, 2024
a257f02
Remove a test duplicated call in dnn tests
Cydral Nov 18, 2024
21dc524
Remove duplicated declaration
Cydral Nov 18, 2024
439bb87
Comment fixed
Cydral Nov 18, 2024
ca01599
Fixing the Cuda compilation
Cydral Dec 7, 2024
2772dca
Merging with updated softmax_ layer
Cydral Dec 9, 2024
1ff436e
Fixing header for CPU compilation
Cydral Dec 9, 2024
274f32f
Adding a missing cast
Cydral Dec 9, 2024
8685ed8
Test fixed to use the new operation_mode enum
Cydral Dec 10, 2024
275bafc
softmaxm test fixed
Cydral Dec 10, 2024
6beab3b
Enum test removed
Cydral Dec 16, 2024
39b09d9
Enum test removed
Cydral Dec 16, 2024
caed8ff
Fixing indentation
Cydral Dec 16, 2024
fbaa299
Fixing indentation
Cydral Dec 16, 2024
f2dea1e
Test removed
Cydral Dec 16, 2024
c9cc82f
Move the operation_mode enumeration to its own header
Cydral Dec 17, 2024
efda8e7
Use operation_mode instead of unsigned long
davisking Dec 20, 2024
149 changes: 106 additions & 43 deletions dlib/cuda/cublas_dlibapi.cpp
@@ -101,55 +101,118 @@ namespace dlib
const tensor& lhs,
bool trans_lhs,
const tensor& rhs,
bool trans_rhs
bool trans_rhs,
size_t g_mode
)
{
// Recall that BLAS uses column major order so to deal with that we flip the
// order of the lhs and rhs arguments.
const auto transa = trans_lhs ? CUBLAS_OP_T : CUBLAS_OP_N;
const auto transb = trans_rhs ? CUBLAS_OP_T : CUBLAS_OP_N;

const int dest_nr = dest.num_samples();
const int dest_nc = dest.size()/dest_nr;
const int lhs_nr = lhs.num_samples();
const int lhs_nc = lhs.size()/lhs_nr;
const int rhs_nr = rhs.num_samples();
const int rhs_nc = rhs.size()/rhs_nr;
if (trans_lhs && trans_rhs)
if (g_mode == 0) // gemm_mode::CHANNEL_WISE
{
DLIB_ASSERT( dest_nr == lhs_nc &&
dest_nc == rhs_nr &&
lhs_nr == rhs_nc)
}
else if (!trans_lhs && trans_rhs)
{
DLIB_ASSERT( dest_nr == lhs_nr &&
dest_nc == rhs_nr &&
lhs_nc == rhs_nc)
}
else if (trans_lhs && !trans_rhs)
{
DLIB_ASSERT( dest_nr == lhs_nc &&
dest_nc == rhs_nc &&
lhs_nr == rhs_nr)
// Recall that BLAS uses column major order so to deal with that we flip the
// order of the lhs and rhs arguments.
const auto transa = trans_lhs ? CUBLAS_OP_T : CUBLAS_OP_N;
const auto transb = trans_rhs ? CUBLAS_OP_T : CUBLAS_OP_N;

const int dest_nr = dest.num_samples();
const int dest_nc = dest.size() / dest_nr;
const int lhs_nr = lhs.num_samples();
const int lhs_nc = lhs.size() / lhs_nr;
const int rhs_nr = rhs.num_samples();
const int rhs_nc = rhs.size() / rhs_nr;
if (trans_lhs && trans_rhs)
{
DLIB_ASSERT(dest_nr == lhs_nc &&
dest_nc == rhs_nr &&
lhs_nr == rhs_nc)
}
else if (!trans_lhs && trans_rhs)
{
DLIB_ASSERT(dest_nr == lhs_nr &&
dest_nc == rhs_nr &&
lhs_nc == rhs_nc)
}
else if (trans_lhs && !trans_rhs)
{
DLIB_ASSERT(dest_nr == lhs_nc &&
dest_nc == rhs_nc &&
lhs_nr == rhs_nr)
}
else
{
DLIB_ASSERT(dest_nr == lhs_nr &&
dest_nc == rhs_nc &&
lhs_nc == rhs_nr)
}

const int k = trans_rhs ? rhs_nc : rhs_nr;
CHECK_CUBLAS(cublasSgemm(context(),
transb,
transa,
dest_nc, dest_nr, k,
&alpha,
rhs.device(), rhs_nc,
lhs.device(), lhs_nc,
&beta,
dest.device(), dest_nc));
}
else
else if (g_mode == 1) // gemm_mode::PLANE_WISE
{
DLIB_ASSERT( dest_nr == lhs_nr &&
dest_nc == rhs_nc &&
lhs_nc == rhs_nr)
}
const auto transa = trans_lhs ? CUBLAS_OP_T : CUBLAS_OP_N;
const auto transb = trans_rhs ? CUBLAS_OP_T : CUBLAS_OP_N;

long num_samples = std::min({ lhs.num_samples(), rhs.num_samples(), dest.num_samples() });
long num_channels = std::min({ lhs.k(), rhs.k(), dest.k() });

auto is_matrix = [](const auto& tensor) {
return ((tensor.num_samples() * tensor.k() == 1 && tensor.nr() * tensor.nc() > 1) ||
(tensor.num_samples() * tensor.k() > 1 && tensor.nr() * tensor.nc() == 1));
};
const bool lhs_is_matrix = is_matrix(lhs), rhs_is_matrix = is_matrix(rhs), dest_is_matrix = is_matrix(dest);

if (lhs_is_matrix && rhs_is_matrix && dest_is_matrix) num_samples = num_channels = 1;

size_t lhs_rows = lhs.nr();
size_t lhs_cols = lhs.nc();
if (lhs_is_matrix && (lhs.num_samples() > 1 || lhs.k() > 1)) {
lhs_rows = lhs.num_samples();
lhs_cols = lhs.k();
}
size_t rhs_rows = rhs.nr();
size_t rhs_cols = rhs.nc();
if (rhs_is_matrix && (rhs.num_samples() > 1 || rhs.k() > 1)) {
rhs_rows = rhs.num_samples();
rhs_cols = rhs.k();
}
size_t dest_rows = dest.nr();
size_t dest_cols = dest.nc();
if (dest_is_matrix && (dest.num_samples() > 1 || dest.k() > 1)) {
dest_rows = dest.num_samples();
dest_cols = dest.k();
}

const size_t lhs_plane_size = lhs_rows * lhs_cols;
const size_t rhs_plane_size = rhs_rows * rhs_cols;
const size_t dest_plane_size = dest_rows * dest_cols;

const int k = trans_rhs ? rhs_nc : rhs_nr;
CHECK_CUBLAS(cublasSgemm(context(),
transb,
transa,
dest_nc, dest_nr, k,
&alpha,
rhs.device(), rhs_nc,
lhs.device(), lhs_nc,
&beta,
dest.device(),dest_nc));
for (long b = 0; b < num_samples; ++b)
{
for (long c = 0; c < num_channels; ++c)
{
auto lhs_slice = lhs_is_matrix ? lhs.device() :
lhs.device() + (b * num_channels + c) * lhs_plane_size;
auto rhs_slice = rhs_is_matrix ? rhs.device() :
rhs.device() + (b * num_channels + c) * rhs_plane_size;
auto dest_slice = dest_is_matrix ? dest.device() :
dest.device() + (b * num_channels + c) * dest_plane_size;
const int k = trans_rhs ? rhs_cols : rhs_rows;

CHECK_CUBLAS(cublasSgemm(
context(), transb, transa, dest_cols, dest_rows, k,
&alpha, rhs_slice, rhs_cols, lhs_slice, lhs_cols,
&beta, dest_slice, dest_cols
));
}
}
}
}

// ------------------------------------------------------------------------------------
Expand Down
57 changes: 44 additions & 13 deletions dlib/cuda/cublas_dlibapi.h
@@ -22,21 +22,52 @@ namespace dlib
const tensor& lhs,
bool trans_lhs,
const tensor& rhs,
bool trans_rhs
bool trans_rhs,
size_t g_mode = 0
);
/*!
requires
- The dimensions of lhs and rhs must be compatible for matrix
multiplication. In particular:
- Let L == trans_lhs ? trans(mat(lhs)) : mat(lhs)
- Let R == trans_rhs ? trans(mat(rhs)) : mat(rhs)
- Let D == mat(dest)
- D.nr() == L.nr() && D.nc() == R.nc()
(i.e. dest must be preallocated and have the correct output dimensions)
- L.nc() == R.nr()
ensures
/*!
requires
- The dimensions of lhs and rhs must be compatible for matrix multiplication.
The specific requirements depend on the g_mode:

For g_mode == 0 (CHANNEL_WISE, default):
- Let L == trans_lhs ? trans(mat(lhs)) : mat(lhs)
- Let R == trans_rhs ? trans(mat(rhs)) : mat(rhs)
- Let D == mat(dest)
- D.nr() == L.nr() && D.nc() == R.nc()
(i.e. dest must be preallocated and have the correct output dimensions)
- L.nc() == R.nr()

For g_mode == 1 (PLANE_WISE):
- lhs.num_samples() == rhs.num_samples() && lhs.k() == rhs.k()
- If !trans_lhs && !trans_rhs:
lhs.nc() == rhs.nr()
dest.nr() == lhs.nr() && dest.nc() == rhs.nc()
- If trans_lhs && !trans_rhs:
lhs.nr() == rhs.nr()
dest.nr() == lhs.nc() && dest.nc() == rhs.nc()
- If !trans_lhs && trans_rhs:
lhs.nc() == rhs.nc()
dest.nr() == lhs.nr() && dest.nc() == rhs.nr()
- If trans_lhs && trans_rhs:
lhs.nr() == rhs.nc()
dest.nr() == lhs.nc() && dest.nc() == rhs.nr()

ensures
- Performs matrix multiplication based on the specified g_mode:

For g_mode == 0 (CHANNEL_WISE):
- performs: dest = alpha*L*R + beta*mat(dest)
!*/
Where L, R, and D are as defined above.

For g_mode == 1 (PLANE_WISE):
- Performs matrix multiplication for each corresponding 2D plane (nr x nc)
in lhs and rhs across all samples and channels.
- The operation is equivalent to performing the following for each sample
and channel:
dest[s][k] = alpha * (lhs[s][k] * rhs[s][k]) + beta * dest[s][k]
Where [s][k] represents the 2D plane for sample s and channel k.
!*/

// ------------------------------------------------------------------------------------

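
The contract documented above can be exercised through the tensor_tools wrapper shown in the next file. The sketch below is a usage illustration, not code from the PR: it assumes the tt::gemm overload and the gemm_mode values of this revision (later commits in the PR rename the enum to operation_mode), so the exact spelling of tt::PLANE_WISE is an assumption.

#include <dlib/dnn.h>

int main()
{
    using namespace dlib;

    // Two samples, three channels; every plane of lhs is 4x6 and every plane
    // of rhs is 6x5, so each per-plane product is a 4x5 matrix.
    const long n = 2, k = 3;
    resizable_tensor lhs(n, k, 4, 6);
    resizable_tensor rhs(n, k, 6, 5);
    resizable_tensor dest(n, k, 4, 5);   // preallocated with the output shape

    tt::tensor_rand rnd;
    rnd.fill_uniform(lhs);
    rnd.fill_uniform(rhs);
    dest = 0;

    // dest[s][c] = 1*(lhs[s][c]*rhs[s][c]) + 0*dest[s][c] for every sample s
    // and channel c (assumed enum spelling, see note above).
    tt::gemm(0, dest, 1, lhs, false, rhs, false, tt::PLANE_WISE);
}

Passing the channel-wise mode instead (or omitting the last argument, if the tt overload keeps the same default as the cublas declaration above) reproduces the original single num_samples x (size/num_samples) product.
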
104 changes: 85 additions & 19 deletions dlib/cuda/tensor_tools.cpp
@@ -208,33 +208,99 @@ namespace dlib { namespace tt
const tensor& lhs,
bool trans_lhs,
const tensor& rhs,
bool trans_rhs
bool trans_rhs,
gemm_mode g_mode
)
{
#ifdef DLIB_USE_CUDA
cuda::gemm(beta, dest, alpha, lhs, trans_lhs, rhs, trans_rhs);
cuda::gemm(beta, dest, alpha, lhs, trans_lhs, rhs, trans_rhs, g_mode);
#else
if (beta != 0)
if (g_mode == CHANNEL_WISE)
{
if (trans_lhs && trans_rhs)
dest = alpha*trans(mat(lhs))*trans(mat(rhs)) + beta*mat(dest);
else if (!trans_lhs && trans_rhs)
dest = alpha*mat(lhs)*trans(mat(rhs)) + beta*mat(dest);
else if (trans_lhs && !trans_rhs)
dest = alpha*trans(mat(lhs))*mat(rhs) + beta*mat(dest);
if (beta != 0)
{
if (trans_lhs && trans_rhs)
dest = alpha * trans(mat(lhs)) * trans(mat(rhs)) + beta * mat(dest);
else if (!trans_lhs && trans_rhs)
dest = alpha * mat(lhs) * trans(mat(rhs)) + beta * mat(dest);
else if (trans_lhs && !trans_rhs)
dest = alpha * trans(mat(lhs)) * mat(rhs) + beta * mat(dest);
else
dest = alpha * mat(lhs) * mat(rhs) + beta * mat(dest);
}
else
dest = alpha*mat(lhs)*mat(rhs) + beta*mat(dest);
{
if (trans_lhs && trans_rhs)
dest = alpha * trans(mat(lhs)) * trans(mat(rhs));
else if (!trans_lhs && trans_rhs)
dest = alpha * mat(lhs) * trans(mat(rhs));
else if (trans_lhs && !trans_rhs)
dest = alpha * trans(mat(lhs)) * mat(rhs);
else
dest = alpha * mat(lhs) * mat(rhs);
}
}
else
else if (g_mode == PLANE_WISE)
{
if (trans_lhs && trans_rhs)
dest = alpha*trans(mat(lhs))*trans(mat(rhs));
else if (!trans_lhs && trans_rhs)
dest = alpha*mat(lhs)*trans(mat(rhs));
else if (trans_lhs && !trans_rhs)
dest = alpha*trans(mat(lhs))*mat(rhs);
else
dest = alpha*mat(lhs)*mat(rhs);
auto is_matrix = [](const auto& tensor) {
return ((tensor.num_samples() * tensor.k() == 1 && tensor.nr() * tensor.nc() > 1) ||
(tensor.num_samples() * tensor.k() > 1 && tensor.nr() * tensor.nc() == 1));
};

long num_samples = std::min({ lhs.num_samples(), rhs.num_samples(), dest.num_samples() });
long num_channels = std::min({ lhs.k(), rhs.k(), dest.k() });
const bool lhs_is_matrix = is_matrix(lhs), rhs_is_matrix = is_matrix(rhs), dest_is_matrix = is_matrix(dest);

if (lhs_is_matrix && rhs_is_matrix && dest_is_matrix) {
num_samples = num_channels = 1;
}

long lhs_rows = (lhs_is_matrix && lhs.num_samples() > 1) ? lhs.num_samples() : lhs.nr();
long lhs_cols = (lhs_is_matrix && lhs.k() > 1) ? lhs.k() : lhs.nc();
long rhs_rows = (rhs_is_matrix && rhs.num_samples() > 1) ? rhs.num_samples() : rhs.nr();
long rhs_cols = (rhs_is_matrix && rhs.k() > 1) ? rhs.k() : rhs.nc();
long dest_rows = (dest_is_matrix && dest.num_samples() > 1) ? dest.num_samples() : dest.nr();
long dest_cols = (dest_is_matrix && dest.k() > 1) ? dest.k() : dest.nc();

const size_t lhs_plane_size = lhs_rows * lhs_cols;
const size_t rhs_plane_size = rhs_rows * rhs_cols;
const size_t dest_plane_size = dest_rows * dest_cols;

for (long b = 0; b < num_samples; ++b)
{
for (long c = 0; c < num_channels; ++c)
{
auto lhs_slice = lhs_is_matrix ? alias_tensor(lhs_rows, lhs_cols)(lhs, 0) :
alias_tensor(lhs_rows, lhs_cols)(lhs, (b * num_channels + c) * lhs_plane_size);
auto rhs_slice = rhs_is_matrix ? alias_tensor(rhs_rows, rhs_cols)(rhs, 0) :
alias_tensor(rhs_rows, rhs_cols)(rhs, (b * num_channels + c) * rhs_plane_size);
auto dest_slice = dest_is_matrix ? alias_tensor(dest_rows, dest_cols)(dest, 0) :
alias_tensor(dest_rows, dest_cols)(dest, (b * num_channels + c) * dest_plane_size);

if (beta != 0)
{
if (trans_lhs && trans_rhs)
dest_slice = alpha * trans(mat(lhs_slice)) * trans(mat(rhs_slice)) + beta * mat(dest_slice);
else if (!trans_lhs && trans_rhs)
dest_slice = alpha * mat(lhs_slice) * trans(mat(rhs_slice)) + beta * mat(dest_slice);
else if (trans_lhs && !trans_rhs)
dest_slice = alpha * trans(mat(lhs_slice)) * mat(rhs_slice) + beta * mat(dest_slice);
else
dest_slice = alpha * mat(lhs_slice) * mat(rhs_slice) + beta * mat(dest_slice);
}
else
{
if (trans_lhs && trans_rhs)
dest_slice = alpha * trans(mat(lhs_slice)) * trans(mat(rhs_slice));
else if (!trans_lhs && trans_rhs)
dest_slice = alpha * mat(lhs_slice) * trans(mat(rhs_slice));
else if (trans_lhs && !trans_rhs)
dest_slice = alpha * trans(mat(lhs_slice)) * mat(rhs_slice);
else
dest_slice = alpha * mat(lhs_slice) * mat(rhs_slice);
}
}
}
}
#endif
}
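
One way to sanity-check the CPU path above is to recompute each plane with ordinary dlib matrix products and compare. The snippet below is a hypothetical check in the spirit of the dnn tests touched by this PR, not code from it, and it relies on the same assumed tt::gemm overload and enum spelling as the example above.

#include <dlib/dnn.h>
#include <dlib/matrix.h>
#include <algorithm>
#include <iostream>

int main()
{
    using namespace dlib;

    const long n = 2, k = 3, nr = 3, nc = 4, inner = 5;
    resizable_tensor lhs(n, k, nr, inner), rhs(n, k, inner, nc), dest(n, k, nr, nc);

    tt::tensor_rand rnd;
    rnd.fill_uniform(lhs);
    rnd.fill_uniform(rhs);
    dest = 0;

    tt::gemm(0, dest, 1, lhs, false, rhs, false, tt::PLANE_WISE);  // assumed spelling

    // Recompute every (sample, channel) plane with plain matrices and compare.
    float max_err = 0;
    for (long b = 0; b < n; ++b)
    {
        for (long c = 0; c < k; ++c)
        {
            const float* L = lhs.host()  + (b * k + c) * nr * inner;
            const float* R = rhs.host()  + (b * k + c) * inner * nc;
            const float* D = dest.host() + (b * k + c) * nr * nc;

            matrix<float> ref = mat(L, nr, inner) * mat(R, inner, nc);
            max_err = std::max(max_err, max(abs(mat(D, nr, nc) - ref)));
        }
    }
    std::cout << "max per-plane deviation: " << max_err << "\n";
}
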