Add multm_prev_ layer and enhance gemm() function for PLANE_WISE operations #3020

Merged
merged 36 commits on Dec 20, 2024
Changes from 35 commits
Commits (36)
4d698fc
Fix Stride Indexing Bugs in `reorg` and `reorg_gradient` Functions (C…
Cydral Sep 16, 2024
1d73b6c
'add_to' parameter missing in cuda call reorg_gradient.launch_kernel()
Cydral Sep 20, 2024
c343779
Cleanup: remove using namespace std; (#3016)
arrufat Sep 23, 2024
724ec09
Merge branch 'refs/heads/master' into Cydral-master
davisking Sep 23, 2024
4dca9b2
fix build error
davisking Sep 23, 2024
2f68a11
Adjust comment formatting to be like other dlib comments
davisking Sep 23, 2024
64e3471
Merge branch 'davisking:master' into master
Cydral Sep 23, 2024
640c02f
Add positional encodings layer to Dlib
Cydral Sep 24, 2024
0f1e250
Add multm_prev layer and enhance gemm() function for PLANE_WISE opera…
Cydral Sep 26, 2024
e8e10ce
Updates
Cydral Sep 26, 2024
06a7f6a
Updates
Cydral Sep 26, 2024
d40171d
Merge branch 'master' into multm-prev-layer
Cydral Sep 30, 2024
0d60627
Resynchronization with tril_ class
Cydral Sep 30, 2024
ed39b2c
Delete .vscode/settings.json
Cydral Oct 6, 2024
8e2a48c
Merge branch 'master' into multm-prev-layer
Cydral Nov 4, 2024
300a8c6
Remove duplicates
Cydral Nov 4, 2024
d173fbd
Small improvements to PLANE_WISE in gemm() function
Cydral Nov 8, 2024
c81efb7
Same improvements for the CPU version
Cydral Nov 11, 2024
89746e2
Merge branch 'davisking:master' into multm-prev-layer
Cydral Nov 18, 2024
3d60227
Introducing a new enum for operation modes in tensor computations
Cydral Nov 18, 2024
a257f02
Remove a test duplicated call in dnn tests
Cydral Nov 18, 2024
21dc524
Remove duplicated declaration
Cydral Nov 18, 2024
439bb87
Comment fixed
Cydral Nov 18, 2024
ca01599
Fixing the Cuda compilation
Cydral Dec 7, 2024
2772dca
Merging with updated softmax_ layer
Cydral Dec 9, 2024
1ff436e
Fixing header for CPU compilation
Cydral Dec 9, 2024
274f32f
Adding a missing cast
Cydral Dec 9, 2024
8685ed8
Test fixed to use the new operation_mode enum
Cydral Dec 10, 2024
275bafc
softmaxm test fixed
Cydral Dec 10, 2024
6beab3b
Enum test removed
Cydral Dec 16, 2024
39b09d9
Enum test removed
Cydral Dec 16, 2024
caed8ff
Fixing indentation
Cydral Dec 16, 2024
fbaa299
Fixing indentation
Cydral Dec 16, 2024
f2dea1e
Test removed
Cydral Dec 16, 2024
c9cc82f
Move the operation_mode enumeration to its own header
Cydral Dec 17, 2024
efda8e7
Use operation_mode instead of unsigned long
davisking Dec 20, 2024
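The PLANE_WISE mode mentioned in the commit messages above treats a 4-D tensor as a batch of independent nr x nc matrices, one per (sample, channel) pair, instead of flattening each sample into a single matrix. The sketch below is an illustration only, not the PR's gemm() code: the function name and loop structure are made up for this page, but the buffers follow dlib's (sample, channel, row, column) memory order.

#include <cstddef>
#include <vector>

// Illustration only: multiply the nr1 x nc1 plane of `a` with the nr2 x nc2 plane of
// `b` for every sample and channel, writing an nr1 x nc2 plane into `out`.
// Requires nc1 == nr2.  Buffers are in (sample, channel, row, column) order.
void plane_wise_matmul(
    const std::vector<float>& a, const std::vector<float>& b, std::vector<float>& out,
    long num_samples, long k, long nr1, long nc1, long nr2, long nc2)
{
    out.assign(static_cast<std::size_t>(num_samples) * k * nr1 * nc2, 0.0f);
    for (long n = 0; n < num_samples; ++n)
    {
        for (long c = 0; c < k; ++c)
        {
            const float* pa = a.data() + (n * k + c) * nr1 * nc1;
            const float* pb = b.data() + (n * k + c) * nr2 * nc2;
            float* po = out.data() + (n * k + c) * nr1 * nc2;
            for (long r = 0; r < nr1; ++r)
                for (long j = 0; j < nc2; ++j)
                {
                    float sum = 0.0f;
                    for (long i = 0; i < nc1; ++i)   // inner dimension: nc1 == nr2
                        sum += pa[r * nc1 + i] * pb[i * nc2 + j];
                    po[r * nc2 + j] = sum;
                }
        }
    }
}

This per-plane batching is the behaviour the PR title refers to; the hunks below use the same (sample, channel, row, column) indexing for the plane-wise softmax.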
221 changes: 137 additions & 84 deletions dlib/cuda/cpu_dlib.cpp
@@ -1620,122 +1620,175 @@ namespace dlib

    namespace ttimpl
    {
-    void softmax (
-        const long num_locations,
-        const long num_channels,
-        tensor& dest,
-        const tensor& src
-    )
-    {
-        DLIB_ASSERT(num_channels*num_locations == src.nr()*src.nc()*src.k());
-        DLIB_CASSERT(have_same_dimensions(dest,src));
-        const auto d = dest.host();
-        const auto s = src.host();
+    void softmax(
+        const long num_locations,
+        const long num_channels,
+        tensor& dest,
+        const tensor& src,
+        operation_mode mode = operation_mode::CHANNEL_WISE
+    )
+    {
+        DLIB_ASSERT(num_channels * num_locations == src.nr() * src.nc() * src.k());
+        DLIB_CASSERT(have_same_dimensions(dest, src));
+        const auto d = dest.host();
+        const auto s = src.host();

        // Note that we subtract out the max values in each channel before applying
        // exp() to avoid numeric overflow in the subsequent computations. Doing this
        // doesn't change the resulting output, it just makes it more numerically
        // stable.
-        for (long n = 0; n < src.num_samples(); ++n)
-        {
-            auto ss = s + num_locations*num_channels*n;
-            auto dd = d + num_locations*num_channels*n;
-            for (long i = 0; i < num_locations; ++i)
-            {
-                float max_val = -std::numeric_limits<float>::infinity();
-                for (long k = 0; k < num_channels; ++k)
-                    max_val = std::max(max_val, ss[k*num_locations]);
-
-                for (long k = 0; k < num_channels; ++k)
-                    dd[k*num_locations] = std::exp(ss[k*num_locations]-max_val);
-
-                ++ss;
-                ++dd;
-            }
-        }
-
-        // Now normalize each channel so they sum to 1.
-        for (long n = 0; n < src.num_samples(); ++n)
-        {
-            const auto dd = d + num_locations*num_channels*n;
-            for (long i = 0; i < num_locations; ++i)
-            {
-                const auto ddd = dd+i;
-
-                float temp = 0;
-                for (long k = 0; k < num_channels; ++k)
-                    temp += ddd[k*num_locations];
-                for (long k = 0; k < num_channels; ++k)
-                    ddd[k*num_locations] /= temp;
-            }
-        }
-    }
+        for (long n = 0; n < src.num_samples(); ++n)
+        {
+            auto ss = s + num_locations * num_channels * n;
+            auto dd = d + num_locations * num_channels * n;
+
+            if (mode == operation_mode::CHANNEL_WISE)
+            {
+                for (long i = 0; i < num_locations; ++i)
+                {
+                    float max_val = -std::numeric_limits<float>::infinity();
+                    for (long k = 0; k < num_channels; ++k)
+                        max_val = std::max(max_val, ss[k * num_locations]);
+
+                    float sum = 0.0f;
+                    for (long k = 0; k < num_channels; ++k)
+                    {
+                        dd[k * num_locations] = std::exp(ss[k * num_locations] - max_val);
+                        sum += dd[k * num_locations];
+                    }
+                    for (long k = 0; k < num_channels; ++k)
+                        dd[k * num_locations] /= sum;
+
+                    ++ss;
+                    ++dd;
+                }
+            }
+            else if (mode == operation_mode::PLANE_WISE)
+            {
+                for (long k = 0; k < num_channels; ++k)
+                {
+                    auto s_channel = ss + k * num_locations;
+                    auto d_channel = dd + k * num_locations;
+                    for (long r = 0; r < src.nr(); ++r)
+                    {
+                        float max_val = -std::numeric_limits<float>::infinity();
+                        for (long c = 0, idx = r * src.nc(); c < src.nc(); ++c, ++idx)
+                            max_val = std::max(max_val, s_channel[idx]);
+
+                        if (max_val == -std::numeric_limits<float>::infinity())
+                        {
+                            for (long c = 0, idx = r * src.nc(); c < src.nc(); ++c, ++idx)
+                                d_channel[idx] = 0.0f;
+                        }
+                        else
+                        {
+                            float sum = 0.0f;
+                            for (long c = 0, idx = r * src.nc(); c < src.nc(); ++c, ++idx)
+                            {
+                                d_channel[idx] = std::exp(s_channel[idx] - max_val);
+                                sum += d_channel[idx];
+                            }
+                            for (long c = 0, idx = r * src.nc(); c < src.nc(); ++c, ++idx)
+                                d_channel[idx] /= sum;
+                        }
+                    }
+                }
+            }
+        }
+    }

-    void softmax_gradient (
-        const long num_locations,
-        const long num_channels,
-        tensor& grad,
-        const tensor& dest,
-        const tensor& gradient_input
-    )
-    {
-        DLIB_ASSERT(num_channels*num_locations == grad.nr()*grad.nc()*grad.k());
-        DLIB_CASSERT(have_same_dimensions(grad,dest));
-        DLIB_CASSERT(have_same_dimensions(grad,gradient_input));
-        const auto d = dest.host();
-        const auto g = grad.host();
-        const auto in = gradient_input.host();
-
-
-        for (long n = 0; n < grad.num_samples(); ++n)
-        {
-            const auto d2 = d + num_locations*num_channels*n;
-            const auto g2 = g + num_locations*num_channels*n;
-            const auto in2 = in + num_locations*num_channels*n;
-            for (long i = 0; i < num_locations; ++i)
-            {
-                const auto d3 = d2+i;
-                const auto g3 = g2+i;
-                const auto in3 = in2+i;
-
-                float temp = 0;
-                for (long k = 0; k < num_channels; ++k)
-                    temp += -d3[k*num_locations]*in3[k*num_locations];
-                if (is_same_object(gradient_input, grad))
-                {
-                    for (long k = 0; k < num_channels; ++k)
-                        g3[k*num_locations] = d3[k*num_locations]*(temp+in3[k*num_locations]);
-                }
-                else
-                {
-                    for (long k = 0; k < num_channels; ++k)
-                        g3[k*num_locations] += d3[k*num_locations]*(temp+in3[k*num_locations]);
-                }
-            }
-        }
-    }
+    void softmax_gradient(
+        const long num_locations,
+        const long num_channels,
+        tensor& grad,
+        const tensor& dest,
+        const tensor& gradient_input,
+        operation_mode mode = operation_mode::CHANNEL_WISE
+    )
+    {
+        DLIB_ASSERT(num_channels * num_locations == grad.nr() * grad.nc() * grad.k());
+        DLIB_CASSERT(have_same_dimensions(grad, dest));
+        DLIB_CASSERT(have_same_dimensions(grad, gradient_input));
+
+        const auto d = dest.host();
+        const auto g = grad.host();
+        const auto in = gradient_input.host();
+        for (long n = 0; n < grad.num_samples(); ++n)
+        {
+            const auto d2 = d + num_locations * num_channels * n;
+            const auto g2 = g + num_locations * num_channels * n;
+            const auto in2 = in + num_locations * num_channels * n;
+
+            if (mode == operation_mode::CHANNEL_WISE)
+            {
+                for (long i = 0; i < num_locations; ++i)
+                {
+                    const auto d3 = d2 + i;
+                    const auto g3 = g2 + i;
+                    const auto in3 = in2 + i;
+                    float sum = 0.0f;
+                    for (long k = 0; k < num_channels; ++k)
+                        sum += -d3[k * num_locations] * in3[k * num_locations];
+                    if (is_same_object(gradient_input, grad))
+                    {
+                        for (long k = 0; k < num_channels; ++k)
+                            g3[k * num_locations] = d3[k * num_locations] * (sum + in3[k * num_locations]);
+                    }
+                    else
+                    {
+                        for (long k = 0; k < num_channels; ++k)
+                            g3[k * num_locations] += d3[k * num_locations] * (sum + in3[k * num_locations]);
+                    }
+                }
+            }
+            else if (mode == operation_mode::PLANE_WISE)
+            {
+                for (long k = 0; k < num_channels; ++k)
+                {
+                    const auto d_channel = d2 + k * num_locations;
+                    const auto g_channel = g2 + k * num_locations;
+                    const auto in_channel = in2 + k * num_locations;
+                    for (long r = 0; r < grad.nr(); ++r)
+                    {
+                        float sum = 0.0f;
+                        for (long c = 0, idx = r * grad.nc(); c < grad.nc(); ++c, ++idx)
+                            sum += -d_channel[idx] * in_channel[idx];
+                        if (is_same_object(gradient_input, grad))
+                        {
+                            for (long c = 0, idx = r * grad.nc(); c < grad.nc(); ++c, ++idx)
+                                g_channel[idx] = d_channel[idx] * (sum + in_channel[idx]);
+                        }
+                        else
+                        {
+                            for (long c = 0, idx = r * grad.nc(); c < grad.nc(); ++c, ++idx)
+                                g_channel[idx] += d_channel[idx] * (sum + in_channel[idx]);
+                        }
+                    }
+                }
+            }
+        }
+    }
    }

    // ----------------------------------------------------------------------------------------

-    void softmax (
+    void softmax(
        tensor& dest,
-        const tensor& src
+        const tensor& src,
+        operation_mode mode
    )
    {
-        DLIB_CASSERT(have_same_dimensions(dest,src));
-        ttimpl::softmax(src.nr()*src.nc(), src.k(), dest, src);
+        DLIB_CASSERT(have_same_dimensions(dest, src));
+        DLIB_CASSERT(mode == operation_mode::CHANNEL_WISE || mode == operation_mode::PLANE_WISE, "Invalid softmax mode");
+        ttimpl::softmax(src.nr() * src.nc(), src.k(), dest, src, mode);
    }

-    void softmax_gradient (
+    void softmax_gradient(
        tensor& grad,
        const tensor& dest,
-        const tensor& gradient_input
+        const tensor& gradient_input,
+        operation_mode mode
    )
    {
-        DLIB_CASSERT(have_same_dimensions(grad,dest));
-        DLIB_CASSERT(have_same_dimensions(grad,gradient_input));
-        ttimpl::softmax_gradient(grad.nr()*grad.nc(), grad.k(), grad, dest, gradient_input);
+        DLIB_CASSERT(have_same_dimensions(grad, dest));
+        DLIB_CASSERT(have_same_dimensions(grad, gradient_input));
+        ttimpl::softmax_gradient(grad.nr() * grad.nc(), grad.k(), grad, dest, gradient_input, mode);
    }

    // ------------------------------------------------------------------------------------
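To make the two modes in the hunk above concrete: for a tensor of shape (num_samples, k, nr, nc), CHANNEL_WISE computes one softmax per spatial location across the k channels (the pre-existing behaviour), while PLANE_WISE computes one softmax per row of each nr x nc channel plane. The sketch below restates only that indexing on a raw buffer; it deliberately omits the max-subtraction and the all-minus-infinity row handling kept by the real code, so it is a simplified restatement, not a substitute.

#include <cmath>
#include <vector>

enum class toy_mode { channel_wise, plane_wise };

// Restates the grouping used by the two softmax modes above, on a buffer stored in
// (sample, channel, row, column) order.  Numerical-stability details are omitted.
void toy_softmax(std::vector<float>& x, long ns, long k, long nr, long nc, toy_mode mode)
{
    const long plane = nr * nc;
    for (long n = 0; n < ns; ++n)
    {
        float* base = x.data() + n * k * plane;
        if (mode == toy_mode::channel_wise)
        {
            // One softmax per spatial location (r,c), taken across the k channels.
            for (long i = 0; i < plane; ++i)
            {
                float sum = 0.0f;
                for (long c = 0; c < k; ++c)
                {
                    base[c * plane + i] = std::exp(base[c * plane + i]);
                    sum += base[c * plane + i];
                }
                for (long c = 0; c < k; ++c)
                    base[c * plane + i] /= sum;
            }
        }
        else
        {
            // One softmax per row of every channel plane.
            for (long c = 0; c < k; ++c)
                for (long r = 0; r < nr; ++r)
                {
                    float* row = base + c * plane + r * nc;
                    float sum = 0.0f;
                    for (long j = 0; j < nc; ++j)
                    {
                        row[j] = std::exp(row[j]);
                        sum += row[j];
                    }
                    for (long j = 0; j < nc; ++j)
                        row[j] /= sum;
                }
        }
    }
}

The backward pass in the hunk follows the same grouping: within each normalization group it computes g_i = d_i * (in_i - sum_j d_j * in_j), the usual softmax Jacobian-vector product, taken either across channels or across a row.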
10 changes: 6 additions & 4 deletions dlib/cuda/cpu_dlib.h
@@ -291,15 +291,17 @@ namespace dlib

    // -----------------------------------------------------------------------------------

-    void softmax (
+    void softmax(
        tensor& dest,
-        const tensor& src
+        const tensor& src,
+        operation_mode mode = operation_mode::CHANNEL_WISE
    );

-    void softmax_gradient (
+    void softmax_gradient(
        tensor& grad,
        const tensor& dest,
-        const tensor& gradient_input
+        const tensor& gradient_input,
+        operation_mode mode = operation_mode::CHANNEL_WISE
    );

    // ------------------------------------------------------------------------------------
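With the default argument, existing callers keep the channel-wise behaviour; passing operation_mode::PLANE_WISE opts into the per-row softmax. A minimal usage sketch of the CPU entry points declared above (it assumes operation_mode is reachable from namespace dlib once <dlib/dnn.h> is included, which is how the enum's new header appears to be wired up):

#include <dlib/dnn.h>

int main()
{
    using namespace dlib;

    resizable_tensor src(2, 3, 4, 5);   // 2 samples, 3 channels, 4x5 planes
    resizable_tensor dest;
    dest.copy_size(src);

    tt::tensor_rand rnd;
    rnd.fill_gaussian(src);

    // Default argument: the same channel-wise softmax as before this PR.
    cpu::softmax(dest, src);

    // New mode: softmax over each row of every 4x5 plane.
    cpu::softmax(dest, src, operation_mode::PLANE_WISE);
    return 0;
}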