Add multm_prev_ layer and enhance gemm() function for PLANE_WISE operations #3020

Merged
merged 36 commits on Dec 20, 2024
Changes from 35 commits
Commits (36)
4d698fc
Fix Stride Indexing Bugs in `reorg` and `reorg_gradient` Functions (C…
Cydral Sep 16, 2024
1d73b6c
'add_to' parameter missing in cuda call reorg_gradient.launch_kernel()
Cydral Sep 20, 2024
c343779
Cleanup: remove using namespace std; (#3016)
arrufat Sep 23, 2024
724ec09
Merge branch 'refs/heads/master' into Cydral-master
davisking Sep 23, 2024
4dca9b2
fix build error
davisking Sep 23, 2024
2f68a11
Adjust comment formatting to be like other dlib comments
davisking Sep 23, 2024
64e3471
Merge branch 'davisking:master' into master
Cydral Sep 23, 2024
640c02f
Add positional encodings layer to Dlib
Cydral Sep 24, 2024
0f1e250
Add multm_prev layer and enhance gemm() function for PLANE_WISE opera…
Cydral Sep 26, 2024
e8e10ce
Updates
Cydral Sep 26, 2024
06a7f6a
Updates
Cydral Sep 26, 2024
d40171d
Merge branch 'master' into multm-prev-layer
Cydral Sep 30, 2024
0d60627
Resynchronization with tril_ class
Cydral Sep 30, 2024
ed39b2c
Delete .vscode/settings.json
Cydral Oct 6, 2024
8e2a48c
Merge branch 'master' into multm-prev-layer
Cydral Nov 4, 2024
300a8c6
Remove duplicates
Cydral Nov 4, 2024
d173fbd
Small improvements to PLANE_WISE in gemm() function
Cydral Nov 8, 2024
c81efb7
Same improvements for the CPU version
Cydral Nov 11, 2024
89746e2
Merge branch 'davisking:master' into multm-prev-layer
Cydral Nov 18, 2024
3d60227
Introducing a new enum for operation modes in tensor computations
Cydral Nov 18, 2024
a257f02
Remove a test duplicated call in dnn tests
Cydral Nov 18, 2024
21dc524
Remove duplicated declaration
Cydral Nov 18, 2024
439bb87
Comment fixed
Cydral Nov 18, 2024
ca01599
Fixing the Cuda compilation
Cydral Dec 7, 2024
2772dca
Merging with updated softmax_ layer
Cydral Dec 9, 2024
1ff436e
Fixing header for CPU compilation
Cydral Dec 9, 2024
274f32f
Adding a missing cast
Cydral Dec 9, 2024
8685ed8
Test fixed to use the new operation_mode enum
Cydral Dec 10, 2024
275bafc
softmaxm test fixed
Cydral Dec 10, 2024
6beab3b
Enum test removed
Cydral Dec 16, 2024
39b09d9
Enum test removed
Cydral Dec 16, 2024
caed8ff
Fixing indentation
Cydral Dec 16, 2024
fbaa299
Fixing indentation
Cydral Dec 16, 2024
f2dea1e
Test removed
Cydral Dec 16, 2024
c9cc82f
Move the operation_mode enumeration to its own header
Cydral Dec 17, 2024
efda8e7
Use operation_mode instead of unsigned long
davisking Dec 20, 2024
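The PLANE_WISE mode mentioned in the commit messages above treats a 4-D tensor as a batch of independent nr x nc matrices, one per (sample, channel) pair, instead of flattening each sample into a single matrix. The sketch below is an illustration only, not the PR's gemm() code: the function name and loop structure are made up for this page, but the buffers follow dlib's (sample, channel, row, column) memory order.

#include <cstddef>
#include <vector>

// Illustration only: multiply the nr1 x nc1 plane of `a` with the nr2 x nc2 plane of
// `b` for every sample and channel, writing an nr1 x nc2 plane into `out`.
// Requires nc1 == nr2.  Buffers are in (sample, channel, row, column) order.
void plane_wise_matmul(
    const std::vector<float>& a, const std::vector<float>& b, std::vector<float>& out,
    long num_samples, long k, long nr1, long nc1, long nr2, long nc2)
{
    out.assign(static_cast<std::size_t>(num_samples) * k * nr1 * nc2, 0.0f);
    for (long n = 0; n < num_samples; ++n)
    {
        for (long c = 0; c < k; ++c)
        {
            const float* pa = a.data() + (n * k + c) * nr1 * nc1;
            const float* pb = b.data() + (n * k + c) * nr2 * nc2;
            float* po = out.data() + (n * k + c) * nr1 * nc2;
            for (long r = 0; r < nr1; ++r)
                for (long j = 0; j < nc2; ++j)
                {
                    float sum = 0.0f;
                    for (long i = 0; i < nc1; ++i)   // inner dimension: nc1 == nr2
                        sum += pa[r * nc1 + i] * pb[i * nc2 + j];
                    po[r * nc2 + j] = sum;
                }
        }
    }
}

This per-plane batching is the behaviour the PR title refers to; the hunks below use the same (sample, channel, row, column) indexing for the plane-wise softmax.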
221 changes: 137 additions & 84 deletions dlib/cuda/cpu_dlib.cpp
@@ -1620,122 +1620,175 @@ namespace dlib

    namespace ttimpl
    {
-    void softmax (
-        const long num_locations,
-        const long num_channels,
-        tensor& dest,
-        const tensor& src
-    )
-    {
-        DLIB_ASSERT(num_channels*num_locations == src.nr()*src.nc()*src.k());
-        DLIB_CASSERT(have_same_dimensions(dest,src));
-        const auto d = dest.host();
-        const auto s = src.host();
+    void softmax(
+        const long num_locations,
+        const long num_channels,
+        tensor& dest,
+        const tensor& src,
+        operation_mode mode = operation_mode::CHANNEL_WISE
+    )
+    {
+        DLIB_ASSERT(num_channels * num_locations == src.nr() * src.nc() * src.k());
+        DLIB_CASSERT(have_same_dimensions(dest, src));
+        const auto d = dest.host();
+        const auto s = src.host();

        // Note that we subtract out the max values in each channel before applying
        // exp() to avoid numeric overflow in the subsequent computations. Doing this
        // doesn't change the resulting output, it just makes it more numerically
        // stable.
-        for (long n = 0; n < src.num_samples(); ++n)
-        {
-            auto ss = s + num_locations*num_channels*n;
-            auto dd = d + num_locations*num_channels*n;
-            for (long i = 0; i < num_locations; ++i)
-            {
-                float max_val = -std::numeric_limits<float>::infinity();
-                for (long k = 0; k < num_channels; ++k)
-                    max_val = std::max(max_val, ss[k*num_locations]);
-
-                for (long k = 0; k < num_channels; ++k)
-                    dd[k*num_locations] = std::exp(ss[k*num_locations]-max_val);
-
-                ++ss;
-                ++dd;
-            }
-        }
-
-        // Now normalize each channel so they sum to 1.
-        for (long n = 0; n < src.num_samples(); ++n)
-        {
-            const auto dd = d + num_locations*num_channels*n;
-            for (long i = 0; i < num_locations; ++i)
-            {
-                const auto ddd = dd+i;
-
-                float temp = 0;
-                for (long k = 0; k < num_channels; ++k)
-                    temp += ddd[k*num_locations];
-                for (long k = 0; k < num_channels; ++k)
-                    ddd[k*num_locations] /= temp;
-            }
-        }
-    }
+        for (long n = 0; n < src.num_samples(); ++n)
+        {
+            auto ss = s + num_locations * num_channels * n;
+            auto dd = d + num_locations * num_channels * n;
+
+            if (mode == operation_mode::CHANNEL_WISE)
+            {
+                for (long i = 0; i < num_locations; ++i)
+                {
+                    float max_val = -std::numeric_limits<float>::infinity();
+                    for (long k = 0; k < num_channels; ++k)
+                        max_val = std::max(max_val, ss[k * num_locations]);
+
+                    float sum = 0.0f;
+                    for (long k = 0; k < num_channels; ++k)
+                    {
+                        dd[k * num_locations] = std::exp(ss[k * num_locations] - max_val);
+                        sum += dd[k * num_locations];
+                    }
+                    for (long k = 0; k < num_channels; ++k)
+                        dd[k * num_locations] /= sum;
+
+                    ++ss;
+                    ++dd;
+                }
+            }
+            else if (mode == operation_mode::PLANE_WISE)
+            {
+                for (long k = 0; k < num_channels; ++k)
+                {
+                    auto s_channel = ss + k * num_locations;
+                    auto d_channel = dd + k * num_locations;
+                    for (long r = 0; r < src.nr(); ++r)
+                    {
+                        float max_val = -std::numeric_limits<float>::infinity();
+                        for (long c = 0, idx = r * src.nc(); c < src.nc(); ++c, ++idx)
+                            max_val = std::max(max_val, s_channel[idx]);
+
+                        if (max_val == -std::numeric_limits<float>::infinity())
+                        {
+                            for (long c = 0, idx = r * src.nc(); c < src.nc(); ++c, ++idx)
+                                d_channel[idx] = 0.0f;
+                        }
+                        else
+                        {
+                            float sum = 0.0f;
+                            for (long c = 0, idx = r * src.nc(); c < src.nc(); ++c, ++idx)
+                            {
+                                d_channel[idx] = std::exp(s_channel[idx] - max_val);
+                                sum += d_channel[idx];
+                            }
+                            for (long c = 0, idx = r * src.nc(); c < src.nc(); ++c, ++idx)
+                                d_channel[idx] /= sum;
+                        }
+                    }
+                }
+            }
+        }
+    }

-    void softmax_gradient (
-        const long num_locations,
-        const long num_channels,
-        tensor& grad,
-        const tensor& dest,
-        const tensor& gradient_input
-    )
-    {
-        DLIB_ASSERT(num_channels*num_locations == grad.nr()*grad.nc()*grad.k());
-        DLIB_CASSERT(have_same_dimensions(grad,dest));
-        DLIB_CASSERT(have_same_dimensions(grad,gradient_input));
-        const auto d = dest.host();
-        const auto g = grad.host();
-        const auto in = gradient_input.host();
-
-
-        for (long n = 0; n < grad.num_samples(); ++n)
-        {
-            const auto d2 = d + num_locations*num_channels*n;
-            const auto g2 = g + num_locations*num_channels*n;
-            const auto in2 = in + num_locations*num_channels*n;
-            for (long i = 0; i < num_locations; ++i)
-            {
-                const auto d3 = d2+i;
-                const auto g3 = g2+i;
-                const auto in3 = in2+i;
-
-                float temp = 0;
-                for (long k = 0; k < num_channels; ++k)
-                    temp += -d3[k*num_locations]*in3[k*num_locations];
-                if (is_same_object(gradient_input, grad))
-                {
-                    for (long k = 0; k < num_channels; ++k)
-                        g3[k*num_locations] = d3[k*num_locations]*(temp+in3[k*num_locations]);
-                }
-                else
-                {
-                    for (long k = 0; k < num_channels; ++k)
-                        g3[k*num_locations] += d3[k*num_locations]*(temp+in3[k*num_locations]);
-                }
-            }
-        }
-    }
+    void softmax_gradient(
+        const long num_locations,
+        const long num_channels,
+        tensor& grad,
+        const tensor& dest,
+        const tensor& gradient_input,
+        operation_mode mode = operation_mode::CHANNEL_WISE
+    )
+    {
+        DLIB_ASSERT(num_channels * num_locations == grad.nr() * grad.nc() * grad.k());
+        DLIB_CASSERT(have_same_dimensions(grad, dest));
+        DLIB_CASSERT(have_same_dimensions(grad, gradient_input));
+
+        const auto d = dest.host();
+        const auto g = grad.host();
+        const auto in = gradient_input.host();
+        for (long n = 0; n < grad.num_samples(); ++n)
+        {
+            const auto d2 = d + num_locations * num_channels * n;
+            const auto g2 = g + num_locations * num_channels * n;
+            const auto in2 = in + num_locations * num_channels * n;
+
+            if (mode == operation_mode::CHANNEL_WISE)
+            {
+                for (long i = 0; i < num_locations; ++i)
+                {
+                    const auto d3 = d2 + i;
+                    const auto g3 = g2 + i;
+                    const auto in3 = in2 + i;
+                    float sum = 0.0f;
+                    for (long k = 0; k < num_channels; ++k)
+                        sum += -d3[k * num_locations] * in3[k * num_locations];
+                    if (is_same_object(gradient_input, grad))
+                    {
+                        for (long k = 0; k < num_channels; ++k)
+                            g3[k * num_locations] = d3[k * num_locations] * (sum + in3[k * num_locations]);
+                    }
+                    else
+                    {
+                        for (long k = 0; k < num_channels; ++k)
+                            g3[k * num_locations] += d3[k * num_locations] * (sum + in3[k * num_locations]);
+                    }
+                }
+            }
+            else if (mode == operation_mode::PLANE_WISE)
+            {
+                for (long k = 0; k < num_channels; ++k)
+                {
+                    const auto d_channel = d2 + k * num_locations;
+                    const auto g_channel = g2 + k * num_locations;
+                    const auto in_channel = in2 + k * num_locations;
+                    for (long r = 0; r < grad.nr(); ++r)
+                    {
+                        float sum = 0.0f;
+                        for (long c = 0, idx = r * grad.nc(); c < grad.nc(); ++c, ++idx)
+                            sum += -d_channel[idx] * in_channel[idx];
+                        if (is_same_object(gradient_input, grad))
+                        {
+                            for (long c = 0, idx = r * grad.nc(); c < grad.nc(); ++c, ++idx)
+                                g_channel[idx] = d_channel[idx] * (sum + in_channel[idx]);
+                        }
+                        else
+                        {
+                            for (long c = 0, idx = r * grad.nc(); c < grad.nc(); ++c, ++idx)
+                                g_channel[idx] += d_channel[idx] * (sum + in_channel[idx]);
+                        }
+                    }
+                }
+            }
+        }
+    }
    }

    // ----------------------------------------------------------------------------------------

-    void softmax (
+    void softmax(
        tensor& dest,
-        const tensor& src
+        const tensor& src,
+        operation_mode mode
    )
    {
-        DLIB_CASSERT(have_same_dimensions(dest,src));
-        ttimpl::softmax(src.nr()*src.nc(), src.k(), dest, src);
+        DLIB_CASSERT(have_same_dimensions(dest, src));
+        DLIB_CASSERT(mode == operation_mode::CHANNEL_WISE || mode == operation_mode::PLANE_WISE, "Invalid softmax mode");
+        ttimpl::softmax(src.nr() * src.nc(), src.k(), dest, src, mode);
    }

-    void softmax_gradient (
+    void softmax_gradient(
        tensor& grad,
        const tensor& dest,
-        const tensor& gradient_input
+        const tensor& gradient_input,
+        operation_mode mode
    )
    {
-        DLIB_CASSERT(have_same_dimensions(grad,dest));
-        DLIB_CASSERT(have_same_dimensions(grad,gradient_input));
-        ttimpl::softmax_gradient(grad.nr()*grad.nc(), grad.k(), grad, dest, gradient_input);
+        DLIB_CASSERT(have_same_dimensions(grad, dest));
+        DLIB_CASSERT(have_same_dimensions(grad, gradient_input));
+        ttimpl::softmax_gradient(grad.nr() * grad.nc(), grad.k(), grad, dest, gradient_input, mode);
    }

    // ------------------------------------------------------------------------------------
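To make the two modes in the hunk above concrete: for a tensor of shape (num_samples, k, nr, nc), CHANNEL_WISE computes one softmax per spatial location across the k channels (the pre-existing behaviour), while PLANE_WISE computes one softmax per row of each nr x nc channel plane. The sketch below restates only that indexing on a raw buffer; it deliberately omits the max-subtraction and the all-minus-infinity row handling kept by the real code, so it is a simplified restatement, not a substitute.

#include <cmath>
#include <vector>

enum class toy_mode { channel_wise, plane_wise };

// Restates the grouping used by the two softmax modes above, on a buffer stored in
// (sample, channel, row, column) order.  Numerical-stability details are omitted.
void toy_softmax(std::vector<float>& x, long ns, long k, long nr, long nc, toy_mode mode)
{
    const long plane = nr * nc;
    for (long n = 0; n < ns; ++n)
    {
        float* base = x.data() + n * k * plane;
        if (mode == toy_mode::channel_wise)
        {
            // One softmax per spatial location (r,c), taken across the k channels.
            for (long i = 0; i < plane; ++i)
            {
                float sum = 0.0f;
                for (long c = 0; c < k; ++c)
                {
                    base[c * plane + i] = std::exp(base[c * plane + i]);
                    sum += base[c * plane + i];
                }
                for (long c = 0; c < k; ++c)
                    base[c * plane + i] /= sum;
            }
        }
        else
        {
            // One softmax per row of every channel plane.
            for (long c = 0; c < k; ++c)
                for (long r = 0; r < nr; ++r)
                {
                    float* row = base + c * plane + r * nc;
                    float sum = 0.0f;
                    for (long j = 0; j < nc; ++j)
                    {
                        row[j] = std::exp(row[j]);
                        sum += row[j];
                    }
                    for (long j = 0; j < nc; ++j)
                        row[j] /= sum;
                }
        }
    }
}

The backward pass in the hunk follows the same grouping: within each normalization group it computes g_i = d_i * (in_i - sum_j d_j * in_j), the usual softmax Jacobian-vector product, taken either across channels or across a row.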
10 changes: 6 additions & 4 deletions dlib/cuda/cpu_dlib.h
@@ -291,15 +291,17 @@ namespace dlib

    // -----------------------------------------------------------------------------------

-    void softmax (
+    void softmax(
        tensor& dest,
-        const tensor& src
+        const tensor& src,
+        operation_mode mode = operation_mode::CHANNEL_WISE
    );

-    void softmax_gradient (
+    void softmax_gradient(
        tensor& grad,
        const tensor& dest,
-        const tensor& gradient_input
+        const tensor& gradient_input,
+        operation_mode mode = operation_mode::CHANNEL_WISE
    );

    // ------------------------------------------------------------------------------------
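With the default argument, existing callers keep the channel-wise behaviour; passing operation_mode::PLANE_WISE opts into the per-row softmax. A minimal usage sketch of the CPU entry points declared above (it assumes operation_mode is reachable from namespace dlib once <dlib/dnn.h> is included, which is how the enum's new header appears to be wired up):

#include <dlib/dnn.h>

int main()
{
    using namespace dlib;

    resizable_tensor src(2, 3, 4, 5);   // 2 samples, 3 channels, 4x5 planes
    resizable_tensor dest;
    dest.copy_size(src);

    tt::tensor_rand rnd;
    rnd.fill_gaussian(src);

    // Default argument: the same channel-wise softmax as before this PR.
    cpu::softmax(dest, src);

    // New mode: softmax over each row of every 4x5 plane.
    cpu::softmax(dest, src, operation_mode::PLANE_WISE);
    return 0;
}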