Microkernels for LogSoftmax and unit tests #46

Open · wants to merge 6 commits into base: dev
51 changes: 51 additions & 0 deletions include/small/SoftMaxLayer.hpp
@@ -70,4 +70,55 @@ class SoftMaxLayer : public Layer<BufferT>
}
};

//****************************************************************************
template <typename BufferT>
class LogSoftMaxLayer : public Layer<BufferT>
{
public:
typedef typename BufferT::value_type value_type;

LogSoftMaxLayer(shape_type const &input_shape)
: Layer<BufferT>(input_shape) // input_shape == output_shape
{
#if defined(DEBUG_LAYERS)
auto const &output_shape(this->output_shape());
std::cerr << "LogSoftMax(batches:" << output_shape[BATCH]
<< ",chans:" << output_shape[CHANNEL]
<< ",img:" << output_shape[HEIGHT]
<< "x" << output_shape[WIDTH]
<< ")" << std::endl;
#endif
}

virtual ~LogSoftMaxLayer() {}

virtual void compute_output(
std::vector<Tensor<BufferT> const *> input,
Tensor<BufferT>* output) const
{
if ((input.size() != 1) || (input[0]->shape() != this->output_shape()))
{
throw std::invalid_argument(
"LogSoftMaxLayer::compute_output() ERROR: "
"incorrect input buffer shape.");
}

if (output->capacity() < this->output_size())
{
throw std::invalid_argument(
"LogSoftMaxLayer::compute_output() ERROR: "
"insufficient output buffer space.");
}

auto const &output_shape(this->output_shape());

small::LogSoftMax(output_shape[CHANNEL],
output_shape[HEIGHT], output_shape[WIDTH],
input[0]->buffer(),
output->buffer());

output->set_shape(output_shape);
}
};

}
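A minimal usage sketch of the new layer (illustration only, not part of the diff). It follows the compute_output() contract above; the shape-taking Tensor constructor and the {batch, channel, height, width} ordering are assumptions inferred from the surrounding code, not confirmed API.

// Hypothetical usage sketch -- Tensor construction and shape layout assumed.
small::shape_type shape({1U, 16U, 8U, 8U});        // BATCH, CHANNEL, HEIGHT, WIDTH
small::LogSoftMaxLayer<small::FloatBuffer> lsm(shape);

small::Tensor<small::FloatBuffer> input(shape);    // assumed shape constructor
small::Tensor<small::FloatBuffer> output(shape);

// ... fill input.buffer() ..., then:
lsm.compute_output({&input}, &output);             // output = input - log(sum(exp(input)))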
6 changes: 3 additions & 3 deletions include/small/float_detail/abstract_layer.hpp
@@ -78,7 +78,7 @@ void abstract_layer( /// @todo add B (batch size) param?
ScalarT const *I_buf = I->data(); //__restrict__ ?

ScalarT const *F_buf = nullptr;
- if constexpr (op_type == OP_CONV || op_type == OP_LEAKY_RELU || op_type == OP_MUL) // if (F != nullptr)
+ if constexpr (op_type == OP_CONV || op_type == OP_LEAKY_RELU || op_type == OP_MUL || op_type == OP_EWISE_ADD_SCALAR) // if (F != nullptr)
{
F_buf = F->data();
}
@@ -288,7 +288,7 @@ void abstract_layer( /// @todo add B (batch size) param?
// if leaky relu, the weight pointer does not change with the group id

ScalarT const *F_group;
- if constexpr ((op_type == OP_LEAKY_RELU) || (op_type == OP_MUL))
+ if constexpr ((op_type == OP_LEAKY_RELU) || (op_type == OP_MUL) || (op_type == OP_EWISE_ADD_SCALAR))
{
F_group = F_buf;
}
@@ -315,7 +315,7 @@ void abstract_layer( /// @todo add B (batch size) param?
// Loop over input channel reduction
for (index_t i = 0; i < (F_c / _F_cb); i++)
{
- bool first = rewrite_output && (i == 0);
+ bool first = (rewrite_output || op_type == OP_EWISE_ADD_SCALAR) && (i == 0);

ScalarT const *I_channel_block_input =
I_channel_block_output + i * (I_h * I_w * _F_cb * _G_b);
26 changes: 23 additions & 3 deletions include/small/float_detail/abstract_op.hpp
@@ -46,14 +46,19 @@ namespace float_detail
{ \
FLOAT_ACCUM_TILE_C(step, a_cur, O_wb, C_ob); \
} \
else if constexpr (op_type == OP_EWISE_ADD_SCALAR) \
{ \
float scalar = b_cur[0]; \
FLOAT_EWISE_ADD_SCALAR_TILE_C(scalar, O_wb, C_ob); \
} \
else if constexpr (op_type == OP_MUL) \
{ \
float drop_out_rate = b_cur[0]; \
- FLOAT_DIV_TILE_C(drop_out_rate, O_wb, C_ob) \
+ FLOAT_DIV_TILE_C(drop_out_rate, O_wb, C_ob); \
} \
else if constexpr (op_type == OP_EXP) \
{ \
- FLOAT_EXP_TILE_C(step, a_cur, O_wb, C_ob) \
+ FLOAT_EXP_TILE_C(step, a_cur, O_wb, C_ob); \
}

//****************************************************************************
@@ -80,7 +85,12 @@
else if constexpr (op_type == OP_ADD || op_type == OP_AVERAGE_POOL) \
{ \
FLOAT_ACCUM_END_C(step, a_cur, c_cur, W_elements, C_ob); \
} \
else if constexpr (op_type == OP_EWISE_ADD_SCALAR) \
{ \
float scalar = b_cur[0]; \
FLOAT_EWISE_ADD_SCALAR_END_C(c_cur, scalar, W_elements, C_ob); \
} \
else if constexpr (op_type == OP_MUL) \
{ \
float drop_out_rate = b_cur[0]; \
@@ -105,6 +115,11 @@
{ \
FLOAT_ACCUM_TILE_C(step, b_cur, O_wb, C_ob); \
} \
else if constexpr (op_type == OP_EWISE_ADD_SCALAR) \
{ \
float scalar = b_cur[0]; \
FLOAT_EWISE_ADD_SCALAR_TILE_C(scalar, O_wb, C_ob); \
} \
else if constexpr (op_type == OP_MUL) \
{ \
float drop_out_rate = b_cur[0]; \
@@ -129,6 +144,11 @@
{ \
FLOAT_ACCUM_END_C(step, b_cur, c_cur, W_elements, C_ob); \
} \
else if constexpr (op_type == OP_EWISE_ADD_SCALAR) \
{ \
float scalar = b_cur[0]; \
FLOAT_EWISE_ADD_SCALAR_END_C(c_cur, scalar, W_elements, C_ob); \
} \
else if constexpr (op_type == OP_MUL) \
{ \
float drop_out_rate = b_cur[0]; \
13 changes: 10 additions & 3 deletions include/small/float_detail/kernel.hpp
@@ -65,19 +65,26 @@ void inline kernel(
if (first)
{
FLOAT_ZERO_TILE_C(_O_wb, _C_ob);
- if (op_type == OP_MAX_POOL || op_type == OP_MUL)
+ if constexpr(op_type == OP_MAX_POOL || op_type == OP_MUL || op_type == OP_EWISE_ADD_SCALAR)
{
/// @note using platform C_ob
FLOAT_LOAD_TILE_C_strided(I, step, _O_wb, FLOAT_C_ob);
}
- else if (op_type == OP_UPSAMPLE)
+ else if constexpr(op_type == OP_UPSAMPLE)
{
FLOAT_LOAD_TILE_C_upsample(I, _stride, _C_ib, _O_wb, _C_ob);
}
}
else
{
- FLOAT_LOAD_TILE_C(O, _O_wb, _C_ob);
+ if constexpr(op_type == OP_EWISE_ADD_SCALAR)
+ {
+     FLOAT_LOAD_TILE_C_strided(I, step, _O_wb, FLOAT_C_ob);
+ }
+ else
+ {
+     FLOAT_LOAD_TILE_C(O, _O_wb, _C_ob);
+ }
if constexpr (op_type == OP_UPSAMPLE)
{
FLOAT_ACCUM_TILE_C_upsample(I, _stride, _C_ib, _O_wb, _C_ob);
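Netted out, the two branches above give OP_EWISE_ADD_SCALAR one invariant, sketched below (ignoring the redundant zero-fill on the first iteration): the accumulator tile is always seeded from the input I with a strided load and never reloaded from the partially written output O, since the scalar add is a single-pass point-wise op.

// Equivalent load policy (sketch of the code above, not a drop-in rewrite)
if constexpr (op_type == OP_EWISE_ADD_SCALAR)
{
    FLOAT_LOAD_TILE_C_strided(I, step, _O_wb, FLOAT_C_ob);  // first or not
}
else if (first)
{
    FLOAT_ZERO_TILE_C(_O_wb, _C_ob);     // plus op-specific seeding
}
else
{
    FLOAT_LOAD_TILE_C(O, _O_wb, _C_ob);  // resume accumulation
}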
13 changes: 10 additions & 3 deletions include/small/float_detail/kernel_right.hpp
@@ -68,7 +68,7 @@ void inline kernel_right(
{
FLOAT_ZERO_END_C(O_w_left, _C_ob);

- if ( (op_type == OP_MUL)|| (op_type == OP_MAX_POOL && H_lb == 0 && H_ub == 0))
+ if ((op_type == OP_MUL) || (op_type == OP_EWISE_ADD_SCALAR) || (op_type == OP_MAX_POOL && H_lb == 0 && H_ub == 0))
{
FLOAT_LOAD_END_C_strided(I, step, O_w_left, _C_ob);
}
@@ -84,7 +84,14 @@
{
FLOAT_ZERO_END_C(O_w_left, _C_ob);
}
- FLOAT_LOAD_END_C(O, O_w_left, _C_ob);
+ if constexpr(op_type == OP_EWISE_ADD_SCALAR)
+ {
+     FLOAT_LOAD_END_C_strided(I, step, O_w_left, _C_ob);
+ }
+ else
+ {
+     FLOAT_LOAD_END_C(O, O_w_left, _C_ob);
+ }
if constexpr (op_type == OP_UPSAMPLE)
{
FLOAT_ACCUM_END_C_upsample(I, _stride, _C_ib, O_w_left, _C_ob);
@@ -140,7 +147,7 @@ void inline kernel_right(

//@note padding should always be 'v' for pointwise operations,
// so this code path should not be used
- if (op_type == OP_MUL)
+ if (op_type == OP_MUL || op_type == OP_EWISE_ADD_SCALAR)
{
FLOAT_LOAD_END_C_strided(I_ptr, step, r_pad_el, _C_ob);
}
4 changes: 2 additions & 2 deletions include/small/float_detail/kernel_right_1D.hpp
@@ -68,7 +68,7 @@ void inline kernel_right_1D(
{
FLOAT_ZERO_END_C(O_w_left, _C_ob);

- if ((op_type == OP_MUL) || (op_type == OP_MAX_POOL)) // && H_lb == 0 && H_ub == 0))
+ if ((op_type == OP_MUL) || (op_type == OP_EWISE_ADD_SCALAR) || (op_type == OP_MAX_POOL)) // && H_lb == 0 && H_ub == 0))
{
FLOAT_LOAD_END_C_strided(I, step, O_w_left, _C_ob);
}
@@ -140,7 +140,7 @@ void inline kernel_right_1D(

//@note padding should always be 'v' for pointwise operations,
// so this code path should not be used
- if (op_type == OP_MUL)
+ if constexpr(op_type == OP_MUL || op_type == OP_EWISE_ADD_SCALAR)
{
FLOAT_LOAD_END_C_strided(I_ptr, step, r_pad_el, _C_ob);
}
65 changes: 65 additions & 0 deletions include/small/interface_abstract.hpp
@@ -1833,6 +1833,71 @@ void SoftMax(int input_channels,
}
#endif

//============================================================================
#if defined(SMALL_HAS_FLOAT_SUPPORT)
template <class BufferT,
std::enable_if_t<
std::is_same<FloatBuffer, BufferT>::value, bool> = true>
void LogSoftMax(int input_channels,
int input_height, int input_width,
BufferT const &input_buf,
BufferT &output_buf)
{
#if defined(RECORD_CALLS)
std::cout << "LogSoftMax<float>(chans:" << input_channels
<< ",img:" << input_height << "x" << input_width
<< ",I,O)\n";
#endif

if (input_channels % FLOAT_C_ib == 0)
{
        // LogSoftMax(x_i) = x_i - log(sum_j exp(x_j)), computed in three
        // passes: point-wise exp, a global sum, then a point-wise add of
        // the scalar -log(sum) back onto the original input.

// point-wise exponent
float_detail::abstract_layer<
FloatBuffer, FLOAT_C_ob, 1, 1, FLOAT_W_ob, 1, 1, OP_EXP, 0, 1>(
input_channels, // Output Channel Grouping
1, // Output Channels per group
1,
input_height, input_width,
1, 1,
0, 0, 0, 0,
&input_buf, (FloatBuffer *)nullptr, &output_buf);

// global sum
FloatBuffer softmax_norm_buf(1);
float_detail::abstract_layer<
FloatBuffer, 1, 1, FLOAT_C_ob, FLOAT_W_ob, 1, FLOAT_C_ob, OP_ADD, 3, 1>(
1, // Output Channel Grouping
1, // Output Channels per group
input_channels,
input_height, input_width,
input_height, input_width,
0, 0, 0, 0,
&output_buf, (FloatBuffer *)nullptr, &softmax_norm_buf);

softmax_norm_buf.data()[0] = -std::log(softmax_norm_buf.data()[0]);

// element-wise shift
float_detail::abstract_layer<
FloatBuffer, FLOAT_C_ob, 1, 1, FLOAT_W_ob, 1, 1, OP_EWISE_ADD_SCALAR, 0, 0>(
input_channels, // Output Channel Grouping
1, // Output Channels per group
1,
input_height, input_width,
1, 1,
0, 0, 0, 0,
&input_buf, &softmax_norm_buf, &output_buf);

}
else
{
throw std::invalid_argument(
"SoftMax<float> ERROR: in_channels unsupported.");
}
}
#endif
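A scalar reference of the same decomposition (an illustrative sketch, not part of the PR) is handy as a unit-test oracle. Like the kernel path above, it skips the usual max-subtraction stabilization, so std::exp can overflow for large inputs.

// Reference log-softmax mirroring the three passes above.
// Requires <cmath> and <cstddef>.
inline void LogSoftMax_reference(int channels, int height, int width,
                                 float const *input, float *output)
{
    std::size_t const n =
        std::size_t(channels) * std::size_t(height) * std::size_t(width);

    float sum = 0.0f;                    // passes 1-2: point-wise exp + global sum
    for (std::size_t i = 0; i < n; ++i)
        sum += std::exp(input[i]);

    float const shift = -std::log(sum);  // the broadcast scalar

    for (std::size_t i = 0; i < n; ++i)  // pass 3: point-wise add to the input
        output[i] = input[i] + shift;
}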

//****************************************************************************
//****************************************************************************
// nearest neighbor upsampling
1 change: 1 addition & 0 deletions include/small/op_type.hpp
@@ -27,6 +27,7 @@ enum OpType
OP_MUL = 6,
OP_UPSAMPLE = 7, // 'u'
OP_EXP = 8,
OP_EWISE_ADD_SCALAR = 9,
OP_NONE = -1
};

60 changes: 60 additions & 0 deletions include/small/platforms/arm/intrinsics.h
@@ -714,6 +714,66 @@ else\
}
#endif

//****************************************************************************
// Broadcast addition kernels
//****************************************************************************

#define FLOAT_EWISE_ADD_SCALAR_TILE_C(scalar, W_ob, C_ob) \
float32x4_t av; \
av = vld1q_dup_f32(&scalar);\
c_0_0 = vaddq_f32(c_0_0, av); \
c_0_1 = vaddq_f32(c_0_1, av); \
c_0_2 = vaddq_f32(c_0_2, av); \
c_0_3 = vaddq_f32(c_0_3, av); \
c_1_0 = vaddq_f32(c_1_0, av); \
c_1_1 = vaddq_f32(c_1_1, av); \
c_1_2 = vaddq_f32(c_1_2, av); \
c_1_3 = vaddq_f32(c_1_3, av); \
c_2_0 = vaddq_f32(c_2_0, av); \
c_2_1 = vaddq_f32(c_2_1, av); \
c_2_2 = vaddq_f32(c_2_2, av); \
c_2_3 = vaddq_f32(c_2_3, av); \
c_3_0 = vaddq_f32(c_3_0, av); \
c_3_1 = vaddq_f32(c_3_1, av); \
c_3_2 = vaddq_f32(c_3_2, av); \
c_3_3 = vaddq_f32(c_3_3, av); \
c_4_0 = vaddq_f32(c_4_0, av); \
c_4_1 = vaddq_f32(c_4_1, av); \
c_4_2 = vaddq_f32(c_4_2, av); \
c_4_3 = vaddq_f32(c_4_3, av); \
c_5_0 = vaddq_f32(c_5_0, av); \
c_5_1 = vaddq_f32(c_5_1, av); \
c_5_2 = vaddq_f32(c_5_2, av); \
c_5_3 = vaddq_f32(c_5_3, av);

#if FLOAT_SIMD_EPILOGUE == 1
#define FLOAT_EWISE_ADD_SCALAR_END_C(c_cur, scalar, W_last, C_ob) \
float *c_pixel = c_cur; \
for (uint32_t kk = 0; kk < W_last; kk++) \
{ \
float *c_channel = c_pixel; \
for (uint32_t jj = 0; jj < C_ob; jj++) \
{ \
*(c_channel) += scalar; \
c_channel++; \
} \
c_pixel += C_ob; \
}
#else
#define FLOAT_EWISE_ADD_SCALAR_END_C(c_cur, scalar, W_last, C_ob) \
float32x4_t av; \
av = vld1q_dup_f32(&scalar); \
float32x4_t *c_pixel = c_cur; \
for (uint32_t kk = 0; kk < W_last; kk++) \
{ \
for (uint32_t jj = 0; jj < C_ob / FLOAT_SIMD; jj++) \
{ \
c_pixel[(kk) * (C_ob / FLOAT_SIMD) + jj] = \
vaddq_f32(c_pixel[(kk) * (C_ob / FLOAT_SIMD) + jj], av); \
} \
}
#endif
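For intuition (sketch only, not part of the diff): with FLOAT_SIMD == 4, the 6x4 grid of registers c_0_0 .. c_5_3 holds a 6-pixel by 16-channel tile, so the 24 vaddq_f32 lines in FLOAT_EWISE_ADD_SCALAR_TILE_C are the vector form of this scalar loop.

// Scalar equivalent of FLOAT_EWISE_ADD_SCALAR_TILE_C (W_ob = 6, C_ob = 16).
static inline void ewise_add_scalar_tile_ref(float tile[6][16], float scalar)
{
    for (int w = 0; w < 6; ++w)          // W_ob output pixels
        for (int c = 0; c < 16; ++c)     // C_ob channels per pixel
            tile[w][c] += scalar;
}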

//****************************************************************************
// Softmax (Ewise exponentiation)
//****************************************************************************