Microkernels for LogSoftmax and unit tests #46

Open · wants to merge 6 commits into base: dev
51 changes: 51 additions & 0 deletions include/small/SoftMaxLayer.hpp
@@ -70,4 +70,55 @@ class SoftMaxLayer : public Layer<BufferT>
}
};

//****************************************************************************
template <typename BufferT>
class LogSoftMaxLayer : public Layer<BufferT>
{
public:
typedef typename BufferT::value_type value_type;

LogSoftMaxLayer(shape_type const &input_shape)
: Layer<BufferT>(input_shape) // input_shape == output_shape
{
#if defined(DEBUG_LAYERS)
auto const &output_shape(this->output_shape());
std::cerr << "LogSoftMax(batches:" << output_shape[BATCH]
<< ",chans:" << output_shape[CHANNEL]
<< ",img:" << output_shape[HEIGHT]
<< "x" << output_shape[WIDTH]
<< ")" << std::endl;
#endif
}

virtual ~LogSoftMaxLayer() {}

virtual void compute_output(
std::vector<Tensor<BufferT> const *> input,
Tensor<BufferT>* output) const
{
if ((input.size() != 1) || (input[0]->shape() != this->output_shape()))
{
throw std::invalid_argument(
"LogSoftMaxLayer::compute_output() ERROR: "
"incorrect input buffer shape.");
}

if (output->capacity() < this->output_size())
{
throw std::invalid_argument(
"LogSoftMaxLayer::compute_output() ERROR: "
"insufficient output buffer space.");
}

auto const &output_shape(this->output_shape());

small::LogSoftMax(output_shape[CHANNEL],
output_shape[HEIGHT], output_shape[WIDTH],
input[0]->buffer(),
output->buffer());

output->set_shape(output_shape);
}
};

}
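A minimal usage sketch of the new layer (illustration only, not part of the diff). It follows the compute_output() contract above; the shape-taking Tensor constructor and the {batch, channel, height, width} ordering are assumptions inferred from the surrounding code, not confirmed API.

// Hypothetical usage sketch -- Tensor construction and shape layout assumed.
small::shape_type shape({1U, 16U, 8U, 8U});        // BATCH, CHANNEL, HEIGHT, WIDTH
small::LogSoftMaxLayer<small::FloatBuffer> lsm(shape);

small::Tensor<small::FloatBuffer> input(shape);    // assumed shape constructor
small::Tensor<small::FloatBuffer> output(shape);

// ... fill input.buffer() ..., then:
lsm.compute_output({&input}, &output);             // output = input - log(sum(exp(input)))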
6 changes: 3 additions & 3 deletions include/small/float_detail/abstract_layer.hpp
@@ -78,7 +78,7 @@ void abstract_layer( /// @todo add B (batch size) param?
ScalarT const *I_buf = I->data(); //__restrict__ ?

ScalarT const *F_buf = nullptr;
- if constexpr (op_type == OP_CONV || op_type == OP_LEAKY_RELU || op_type == OP_MUL) // if (F != nullptr)
+ if constexpr (op_type == OP_CONV || op_type == OP_LEAKY_RELU || op_type == OP_MUL || op_type == OP_EWISE_ADD_SCALAR) // if (F != nullptr)
{
F_buf = F->data();
}
@@ -288,7 +288,7 @@ void abstract_layer( /// @todo add B (batch size) param?
// if leaky relu, the weight pointer does not change with the group id

ScalarT const *F_group;
- if constexpr ((op_type == OP_LEAKY_RELU) || (op_type == OP_MUL))
+ if constexpr ((op_type == OP_LEAKY_RELU) || (op_type == OP_MUL) || (op_type == OP_EWISE_ADD_SCALAR))
{
F_group = F_buf;
}
@@ -315,7 +315,7 @@ void abstract_layer( /// @todo add B (batch size) param?
// Loop over input channel reduction
for (index_t i = 0; i < (F_c / _F_cb); i++)
{
- bool first = rewrite_output && (i == 0);
+ bool first = (rewrite_output || op_type == OP_EWISE_ADD_SCALAR) && (i == 0);

ScalarT const *I_channel_block_input =
I_channel_block_output + i * (I_h * I_w * _F_cb * _G_b);
26 changes: 23 additions & 3 deletions include/small/float_detail/abstract_op.hpp
@@ -46,14 +46,19 @@ namespace float_detail
{ \
FLOAT_ACCUM_TILE_C(step, a_cur, O_wb, C_ob); \
} \
else if constexpr (op_type == OP_EWISE_ADD_SCALAR) \
{ \
float scalar = b_cur[0]; \
FLOAT_EWISE_ADD_SCALAR_TILE_C(scalar, O_wb, C_ob); \
} \
else if constexpr (op_type == OP_MUL) \
{ \
float drop_out_rate = b_cur[0]; \
- FLOAT_DIV_TILE_C(drop_out_rate, O_wb, C_ob) \
+ FLOAT_DIV_TILE_C(drop_out_rate, O_wb, C_ob); \
} \
else if constexpr (op_type == OP_EXP) \
{ \
- FLOAT_EXP_TILE_C(step, a_cur, O_wb, C_ob) \
+ FLOAT_EXP_TILE_C(step, a_cur, O_wb, C_ob); \
}

//****************************************************************************
@@ -80,7 +85,12 @@
else if constexpr (op_type == OP_ADD || op_type == OP_AVERAGE_POOL) \
{ \
FLOAT_ACCUM_END_C(step, a_cur, c_cur, W_elements, C_ob); \
} \
else if constexpr (op_type == OP_EWISE_ADD_SCALAR) \
{ \
float scalar = b_cur[0]; \
FLOAT_EWISE_ADD_SCALAR_END_C(c_cur, scalar, W_elements, C_ob); \
} \
else if constexpr (op_type == OP_MUL) \
{ \
float drop_out_rate = b_cur[0]; \
@@ -105,6 +115,11 @@
{ \
FLOAT_ACCUM_TILE_C(step, b_cur, O_wb, C_ob); \
} \
else if constexpr (op_type == OP_EWISE_ADD_SCALAR) \
{ \
float scalar = b_cur[0]; \
FLOAT_EWISE_ADD_SCALAR_TILE_C(scalar, O_wb, C_ob); \
} \
else if constexpr (op_type == OP_MUL) \
{ \
float drop_out_rate = b_cur[0]; \
@@ -129,6 +144,11 @@
{ \
FLOAT_ACCUM_END_C(step, b_cur, c_cur, W_elements, C_ob); \
} \
else if constexpr (op_type == OP_EWISE_ADD_SCALAR) \
{ \
float scalar = b_cur[0]; \
FLOAT_EWISE_ADD_SCALAR_END_C(c_cur, scalar, W_elements, C_ob); \
} \
else if constexpr (op_type == OP_MUL) \
{ \
float drop_out_rate = b_cur[0]; \
13 changes: 10 additions & 3 deletions include/small/float_detail/kernel.hpp
@@ -65,19 +65,26 @@ void inline kernel(
if (first)
{
FLOAT_ZERO_TILE_C(_O_wb, _C_ob);
- if (op_type == OP_MAX_POOL || op_type == OP_MUL)
+ if constexpr(op_type == OP_MAX_POOL || op_type == OP_MUL || op_type == OP_EWISE_ADD_SCALAR)
{
/// @note using platform C_ob
FLOAT_LOAD_TILE_C_strided(I, step, _O_wb, FLOAT_C_ob);
}
- else if (op_type == OP_UPSAMPLE)
+ else if constexpr(op_type == OP_UPSAMPLE)
{
FLOAT_LOAD_TILE_C_upsample(I, _stride, _C_ib, _O_wb, _C_ob);
}
}
else
{
- FLOAT_LOAD_TILE_C(O, _O_wb, _C_ob);
+ if constexpr(op_type == OP_EWISE_ADD_SCALAR)
+ {
+     FLOAT_LOAD_TILE_C_strided(I, step, _O_wb, FLOAT_C_ob);
+ }
+ else
+ {
+     FLOAT_LOAD_TILE_C(O, _O_wb, _C_ob);
+ }
if constexpr (op_type == OP_UPSAMPLE)
{
FLOAT_ACCUM_TILE_C_upsample(I, _stride, _C_ib, _O_wb, _C_ob);
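Netted out, the two branches above give OP_EWISE_ADD_SCALAR one invariant, sketched below (ignoring the redundant zero-fill on the first iteration): the accumulator tile is always seeded from the input I with a strided load and never reloaded from the partially written output O, since the scalar add is a single-pass point-wise op.

// Equivalent load policy (sketch of the code above, not a drop-in rewrite)
if constexpr (op_type == OP_EWISE_ADD_SCALAR)
{
    FLOAT_LOAD_TILE_C_strided(I, step, _O_wb, FLOAT_C_ob);  // first or not
}
else if (first)
{
    FLOAT_ZERO_TILE_C(_O_wb, _C_ob);     // plus op-specific seeding
}
else
{
    FLOAT_LOAD_TILE_C(O, _O_wb, _C_ob);  // resume accumulation
}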
13 changes: 10 additions & 3 deletions include/small/float_detail/kernel_right.hpp
@@ -68,7 +68,7 @@ void inline kernel_right(
{
FLOAT_ZERO_END_C(O_w_left, _C_ob);

- if ( (op_type == OP_MUL)|| (op_type == OP_MAX_POOL && H_lb == 0 && H_ub == 0))
+ if ((op_type == OP_MUL) || (op_type == OP_EWISE_ADD_SCALAR) || (op_type == OP_MAX_POOL && H_lb == 0 && H_ub == 0))
{
FLOAT_LOAD_END_C_strided(I, step, O_w_left, _C_ob);
}
@@ -84,7 +84,14 @@
{
FLOAT_ZERO_END_C(O_w_left, _C_ob);
}
- FLOAT_LOAD_END_C(O, O_w_left, _C_ob);
+ if constexpr(op_type == OP_EWISE_ADD_SCALAR)
+ {
+     FLOAT_LOAD_END_C_strided(I, step, O_w_left, _C_ob);
+ }
+ else
+ {
+     FLOAT_LOAD_END_C(O, O_w_left, _C_ob);
+ }
if constexpr (op_type == OP_UPSAMPLE)
{
FLOAT_ACCUM_END_C_upsample(I, _stride, _C_ib, O_w_left, _C_ob);
@@ -140,7 +147,7 @@ void inline kernel_right(

//@note padding should always be 'v' for pointwise operations,
// so this code path should not be used
- if (op_type == OP_MUL)
+ if (op_type == OP_MUL || op_type == OP_EWISE_ADD_SCALAR)
{
FLOAT_LOAD_END_C_strided(I_ptr, step, r_pad_el, _C_ob);
}
4 changes: 2 additions & 2 deletions include/small/float_detail/kernel_right_1D.hpp
@@ -68,7 +68,7 @@ void inline kernel_right_1D(
{
FLOAT_ZERO_END_C(O_w_left, _C_ob);

- if ((op_type == OP_MUL) || (op_type == OP_MAX_POOL)) // && H_lb == 0 && H_ub == 0))
+ if ((op_type == OP_MUL) || (op_type == OP_EWISE_ADD_SCALAR) || (op_type == OP_MAX_POOL)) // && H_lb == 0 && H_ub == 0))
{
FLOAT_LOAD_END_C_strided(I, step, O_w_left, _C_ob);
}
@@ -140,7 +140,7 @@ void inline kernel_right_1D(

//@note padding should always be 'v' for pointwise operations,
// so this code path should not be used
- if (op_type == OP_MUL)
+ if constexpr(op_type == OP_MUL || op_type == OP_EWISE_ADD_SCALAR)
{
FLOAT_LOAD_END_C_strided(I_ptr, step, r_pad_el, _C_ob);
}
65 changes: 65 additions & 0 deletions include/small/interface_abstract.hpp
@@ -1833,6 +1833,71 @@ void SoftMax(int input_channels,
}
#endif

//============================================================================
#if defined(SMALL_HAS_FLOAT_SUPPORT)
template <class BufferT,
std::enable_if_t<
std::is_same<FloatBuffer, BufferT>::value, bool> = true>
void LogSoftMax(int input_channels,
int input_height, int input_width,
BufferT const &input_buf,
BufferT &output_buf)
{
#if defined(RECORD_CALLS)
std::cout << "LogSoftMax<float>(chans:" << input_channels
<< ",img:" << input_height << "x" << input_width
<< ",I,O)\n";
#endif

if (input_channels % FLOAT_C_ib == 0)
{
        // LogSoftMax(x_i) = x_i - log(sum_j exp(x_j)), computed in three
        // passes: point-wise exp, a global sum, then a point-wise add of
        // the scalar -log(sum) back onto the original input.

// point-wise exponent
float_detail::abstract_layer<
FloatBuffer, FLOAT_C_ob, 1, 1, FLOAT_W_ob, 1, 1, OP_EXP, 0, 1>(
input_channels, // Output Channel Grouping
1, // Output Channels per group
1,
input_height, input_width,
1, 1,
0, 0, 0, 0,
&input_buf, (FloatBuffer *)nullptr, &output_buf);

// global sum
FloatBuffer softmax_norm_buf(1);
float_detail::abstract_layer<
FloatBuffer, 1, 1, FLOAT_C_ob, FLOAT_W_ob, 1, FLOAT_C_ob, OP_ADD, 3, 1>(
1, // Output Channel Grouping
1, // Output Channels per group
input_channels,
input_height, input_width,
input_height, input_width,
0, 0, 0, 0,
&output_buf, (FloatBuffer *)nullptr, &softmax_norm_buf);

softmax_norm_buf.data()[0] = -std::log(softmax_norm_buf.data()[0]);

// element-wise shift
float_detail::abstract_layer<
FloatBuffer, FLOAT_C_ob, 1, 1, FLOAT_W_ob, 1, 1, OP_EWISE_ADD_SCALAR, 0, 0>(
input_channels, // Output Channel Grouping
1, // Output Channels per group
1,
input_height, input_width,
1, 1,
0, 0, 0, 0,
&input_buf, &softmax_norm_buf, &output_buf);

}
else
{
throw std::invalid_argument(
"SoftMax<float> ERROR: in_channels unsupported.");
}
}
#endif
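A scalar reference of the same decomposition (an illustrative sketch, not part of the PR) is handy as a unit-test oracle. Like the kernel path above, it skips the usual max-subtraction stabilization, so std::exp can overflow for large inputs.

// Reference log-softmax mirroring the three passes above.
// Requires <cmath> and <cstddef>.
inline void LogSoftMax_reference(int channels, int height, int width,
                                 float const *input, float *output)
{
    std::size_t const n =
        std::size_t(channels) * std::size_t(height) * std::size_t(width);

    float sum = 0.0f;                    // passes 1-2: point-wise exp + global sum
    for (std::size_t i = 0; i < n; ++i)
        sum += std::exp(input[i]);

    float const shift = -std::log(sum);  // the broadcast scalar

    for (std::size_t i = 0; i < n; ++i)  // pass 3: point-wise add to the input
        output[i] = input[i] + shift;
}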

//****************************************************************************
//****************************************************************************
// nearest neighbor upsampling
1 change: 1 addition & 0 deletions include/small/op_type.hpp
@@ -27,6 +27,7 @@ enum OpType
OP_MUL = 6,
OP_UPSAMPLE = 7, // 'u'
OP_EXP = 8,
OP_EWISE_ADD_SCALAR = 9,
OP_NONE = -1
};

60 changes: 60 additions & 0 deletions include/small/platforms/arm/intrinsics.h
@@ -714,6 +714,66 @@ else\
}
#endif

//****************************************************************************
// Broadcast addition kernels
//****************************************************************************

#define FLOAT_EWISE_ADD_SCALAR_TILE_C(scalar, W_ob, C_ob) \
float32x4_t av; \
av = vld1q_dup_f32(&scalar);\
c_0_0 = vaddq_f32(c_0_0, av); \
c_0_1 = vaddq_f32(c_0_1, av); \
c_0_2 = vaddq_f32(c_0_2, av); \
c_0_3 = vaddq_f32(c_0_3, av); \
c_1_0 = vaddq_f32(c_1_0, av); \
c_1_1 = vaddq_f32(c_1_1, av); \
c_1_2 = vaddq_f32(c_1_2, av); \
c_1_3 = vaddq_f32(c_1_3, av); \
c_2_0 = vaddq_f32(c_2_0, av); \
c_2_1 = vaddq_f32(c_2_1, av); \
c_2_2 = vaddq_f32(c_2_2, av); \
c_2_3 = vaddq_f32(c_2_3, av); \
c_3_0 = vaddq_f32(c_3_0, av); \
c_3_1 = vaddq_f32(c_3_1, av); \
c_3_2 = vaddq_f32(c_3_2, av); \
c_3_3 = vaddq_f32(c_3_3, av); \
c_4_0 = vaddq_f32(c_4_0, av); \
c_4_1 = vaddq_f32(c_4_1, av); \
c_4_2 = vaddq_f32(c_4_2, av); \
c_4_3 = vaddq_f32(c_4_3, av); \
c_5_0 = vaddq_f32(c_5_0, av); \
c_5_1 = vaddq_f32(c_5_1, av); \
c_5_2 = vaddq_f32(c_5_2, av); \
c_5_3 = vaddq_f32(c_5_3, av);

#if FLOAT_SIMD_EPILOGUE == 1
#define FLOAT_EWISE_ADD_SCALAR_END_C(c_cur, scalar, W_last, C_ob) \
float *c_pixel = c_cur; \
for (uint32_t kk = 0; kk < W_last; kk++) \
{ \
float *c_channel = c_pixel; \
for (uint32_t jj = 0; jj < C_ob; jj++) \
{ \
*(c_channel) += scalar; \
c_channel++; \
} \
c_pixel += C_ob; \
}
#else
#define FLOAT_EWISE_ADD_SCALAR_END_C(c_cur, scalar, W_last, C_ob) \
float32x4_t av; \
av = vld1q_dup_f32(&scalar); \
float32x4_t *c_pixel = c_cur; \
for (uint32_t kk = 0; kk < W_last; kk++) \
{ \
for (uint32_t jj = 0; jj < C_ob / FLOAT_SIMD; jj++) \
{ \
c_pixel[(kk) * (C_ob / FLOAT_SIMD) + jj] = \
vaddq_f32(c_pixel[(kk) * (C_ob / FLOAT_SIMD) + jj], av); \
} \
}
#endif
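For intuition (sketch only, not part of the diff): with FLOAT_SIMD == 4, the 6x4 grid of registers c_0_0 .. c_5_3 holds a 6-pixel by 16-channel tile, so the 24 vaddq_f32 lines in FLOAT_EWISE_ADD_SCALAR_TILE_C are the vector form of this scalar loop.

// Scalar equivalent of FLOAT_EWISE_ADD_SCALAR_TILE_C (W_ob = 6, C_ob = 16).
static inline void ewise_add_scalar_tile_ref(float tile[6][16], float scalar)
{
    for (int w = 0; w < 6; ++w)          // W_ob output pixels
        for (int c = 0; c < 16; ++c)     // C_ob channels per pixel
            tile[w][c] += scalar;
}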

//****************************************************************************
// Softmax (Ewise exponentiation)
//****************************************************************************