Add Transpose Layer for 2D matrix transposition in 4D tensors
Cydral authored Sep 16, 2024
1 parent fafdac3 commit d9f5dff
Showing 10 changed files with 298 additions and 1 deletion.
42 changes: 41 additions & 1 deletion dlib/cuda/cpu_dlib.cpp
@@ -2927,7 +2927,47 @@ namespace dlib
}

// ------------------------------------------------------------------------------------
// ------------------------------------------------------------------------------------

void transpose(
    bool add,
    tensor& dest,
    const tensor& src
)
{
    DLIB_CASSERT(is_same_object(dest, src) == false);
    DLIB_CASSERT(dest.num_samples() == src.num_samples() &&
                 dest.k() == src.k() &&
                 dest.nr() == src.nc() &&
                 dest.nc() == src.nr(),
                 "Incompatible tensor dimensions.");

    const float* src_data = src.host();
    float* dest_data = dest.host();

    const long num_samples = src.num_samples();
    const long k_dim = src.k();
    const long src_nr = src.nr();
    const long src_nc = src.nc();
    const long dest_nr = dest.nr();
    const long dest_nc = dest.nc();

    // Each of the num_samples*k_dim planes is independent, so transpose
    // them in parallel.
    parallel_for(0, num_samples * k_dim, [&](long i) {
        const long n = i / k_dim;
        const long k = i % k_dim;
        const long src_nk_offset = (n * k_dim + k) * src_nr;
        const long dest_nk_offset = (n * k_dim + k) * dest_nr;

        for (long r = 0; r < src_nr; ++r) {
            for (long c = 0; c < src_nc; ++c) {
                const long src_idx = (src_nk_offset + r) * src_nc + c;
                const long dest_idx = (dest_nk_offset + c) * dest_nc + r;

                if (add) dest_data[dest_idx] += src_data[src_idx];
                else dest_data[dest_idx] = src_data[src_idx];
            }
        }
    });
}

// ------------------------------------------------------------------------------------

}
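For reference, the per-plane addressing above is just the usual row-major mapping index(r, c) = r*ncols + c, with the roles of row and column swapped on the destination side. A minimal standalone sketch of the same mapping, using plain std::vector in place of dlib::tensor (illustrative only, not part of the commit):

#include <cassert>
#include <vector>

// Transpose one nr x nc row-major plane, exactly as the CPU kernel above
// does for each (n, k) pair.
int main()
{
    const long nr = 2, nc = 3;
    const std::vector<float> src = { 1, 2, 3,
                                     4, 5, 6 };   // 2x3 plane
    std::vector<float> dest(nc * nr, 0);          // 3x2 plane

    for (long r = 0; r < nr; ++r)
        for (long c = 0; c < nc; ++c)
            dest[c * nr + r] = src[r * nc + c];   // dest(c,r) = src(r,c)

    assert(dest[0 * nr + 1] == 4);                // dest(0,1) == src(1,0)
    assert(dest[2 * nr + 0] == 3);                // dest(2,0) == src(0,2)
    return 0;
}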
8 changes: 8 additions & 0 deletions dlib/cuda/cpu_dlib.h
@@ -671,6 +671,14 @@ namespace dlib
size_t count_k
);

// -----------------------------------------------------------------------------------

void transpose(
bool add_to,
tensor& dest,
const tensor& src
);

// -----------------------------------------------------------------------------------

class compute_loss_binary_log_per_pixel
40 changes: 40 additions & 0 deletions dlib/cuda/cuda_dlib.cu
@@ -2500,6 +2500,46 @@ namespace dlib

// ----------------------------------------------------------------------------------------

__global__ void _cuda_transpose(size_t dsize, size_t dk, size_t dnr, size_t dnc, float* d,
                                size_t sk, size_t snr, size_t snc, const float* s, const bool add_to)
{
    const auto plane_size = dnr * dnc;
    const auto sample_size = dk * plane_size;
    for (auto i : grid_stride_range(0, dsize))
    {
        // i indexes the destination: decompose it into sample n, plane
        // in_k, and position within the plane.  The destination column
        // (idx % dnc) is the source row in_r, and the destination row
        // (idx / dnc) is the source column in_c, mirroring each plane
        // across its diagonal.
        const auto n = i / sample_size;
        const auto idx = i % plane_size;
        const auto in_k = (i / plane_size) % dk;
        const auto in_r = idx % dnc;
        const auto in_c = idx / dnc;

        const auto in_idx = ((n * sk + in_k) * snr + in_r) * snc + in_c;
        if (add_to) d[i] += s[in_idx];
        else d[i] = s[in_idx];
    }
}

void transpose(
    bool add_to,
    tensor& dest,
    const tensor& src
)
{
    DLIB_CASSERT(is_same_object(dest, src) == false);
    DLIB_CASSERT(dest.num_samples() == src.num_samples() &&
                 dest.k() == src.k() &&
                 dest.nr() == src.nc() &&
                 dest.nc() == src.nr(),
                 "Incompatible tensor dimensions.");

    launch_kernel(_cuda_transpose, max_jobs(dest.size()), dest.size(),
                  dest.k(), dest.nr(), dest.nc(), dest.device(),
                  src.k(), src.nr(), src.nc(), src.device(), add_to);
}

// ----------------------------------------------------------------------------------------


__device__ float cuda_log1pexp(float x)
{
if (x <= -18)
7 changes: 7 additions & 0 deletions dlib/cuda/cuda_dlib.h
@@ -570,6 +570,13 @@ namespace dlib
size_t count_k
);

// ----------------------------------------------------------------------------------------

void transpose(
bool add_to,
tensor& dest,
const tensor& src
);

// ----------------------------------------------------------------------------------------

15 changes: 15 additions & 0 deletions dlib/cuda/tensor_tools.cpp
@@ -1279,6 +1279,21 @@ namespace dlib { namespace tt
#endif
}

// ----------------------------------------------------------------------------------------

void transpose(
bool add_to,
tensor& dest,
const tensor& src
)
{
#ifdef DLIB_USE_CUDA
cuda::transpose(add_to, dest, src);
#else
cpu::transpose(add_to, dest, src);
#endif
}

// ----------------------------------------------------------------------------------------

}}
26 changes: 26 additions & 0 deletions dlib/cuda/tensor_tools.h
@@ -2186,6 +2186,32 @@ namespace dlib { namespace tt
i.e., copies the content of each sample from src into the corresponding place in dest.
!*/

// ----------------------------------------------------------------------------------------

void transpose(
bool add_to,
tensor& dest,
const tensor& src
);
/*!
requires
- is_same_object(dest, src) == false
- dest.num_samples() == src.num_samples()
- dest.k() == src.k()
- dest.nr() == src.nc()
- dest.nc() == src.nr()
ensures
- Performs a transpose operation on the nr() x nc() matrices within src.
- If (add_to) is false:
- The result is stored in dest, overwriting its previous contents.
- For all valid n, k, r, c:
- #dest(n,k,c,r) == src(n,k,r,c)
- If (add_to) is true:
- The result is added to the existing contents of dest.
- For all valid n, k, r, c:
- #dest(n,k,c,r) == dest(n,k,c,r) + src(n,k,r,c)
!*/
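To make the two modes concrete, here is a short usage sketch of the tt:: entry point (tensor sizes are arbitrary, chosen only for illustration):

#include <dlib/dnn.h>

using namespace dlib;

int main()
{
    resizable_tensor src(1, 2, 3, 4);   // n=1, k=2, 3x4 planes
    resizable_tensor dest(1, 2, 4, 3);  // transposed planes are 4x3
    tt::tensor_rand rnd;
    rnd.fill_uniform(src);

    tt::transpose(false, dest, src);    // dest  = per-plane transpose of src
    tt::transpose(true,  dest, src);    // dest += per-plane transpose of src

    // Round trip: transposing dest recovers src (times 2, after the add).
    resizable_tensor back(src);
    tt::transpose(false, back, dest);   // back == 2*src element-wise
    return 0;
}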

// ----------------------------------------------------------------------------------------

}}
61 changes: 61 additions & 0 deletions dlib/dnn/layers.h
@@ -4635,6 +4635,67 @@ namespace dlib
template <typename SUBNET>
using reorg = add_layer<reorg_<2, 2>, SUBNET>;

// ----------------------------------------------------------------------------------------

class transpose_ {
public:
transpose_() {}
template <typename SUBNET> void setup(const SUBNET& /* sub */) {}

template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output) {
auto& prev = sub.get_output();

output.set_size(prev.num_samples(), prev.k(), prev.nc(), prev.nr());
tt::transpose(false, output, prev);
}

template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& /*params_grad*/) {
    auto& prev = sub.get_gradient_input();
    // The gradient of a transpose is the transpose of the gradient, so
    // accumulate (add_to == true) the transposed gradient into prev.
    tt::transpose(true, prev, gradient_input);
}

inline dpoint map_input_to_output(dpoint p) const
{
dpoint temp_p;
temp_p.x() = p.y();
temp_p.y() = p.x();
return temp_p;
}
inline dpoint map_output_to_input(dpoint p) const
{
dpoint temp_p;
temp_p.x() = p.y();
temp_p.y() = p.x();
return temp_p;
}

const tensor& get_layer_params() const { return params; }
tensor& get_layer_params() { return params; }

friend void serialize(const transpose_& /* item */, std::ostream& out) {
serialize("transpose_", out);
}
friend void deserialize(transpose_& /* item */, std::istream& in) {
std::string version;
deserialize(version, in);
if (version != "transpose_")
throw serialization_error("Unexpected version '" + version + "' found while deserializing dlib::transpose_.");
}

friend std::ostream& operator<<(std::ostream& out, const transpose_& /* item */) {
out << "transpose";
return out;
}
friend void to_xml(const transpose_& /* item */, std::ostream& out) {
out << "<transpose />\n";
}

private:
dlib::resizable_tensor params; // unused
};

template <typename SUBNET> using transpose = add_layer<transpose_, SUBNET>;

// ----------------------------------------------------------------------------------------

}
54 changes: 54 additions & 0 deletions dlib/dnn/layers_abstract.h
@@ -3649,6 +3649,60 @@ namespace dlib
template <typename SUBNET>
using reorg = add_layer<reorg_<2, 2>, SUBNET>;

// ----------------------------------------------------------------------------------------

class transpose_
{
/*!
WHAT THIS OBJECT REPRESENTS
This is an implementation of the EXAMPLE_COMPUTATIONAL_LAYER_ interface
defined above. In particular, this layer performs a 2D matrix transposition
on each of the k planes within each sample of a 4D tensor.
The dimensions of the tensor output by this layer are as follows (letting
IN be the input tensor and OUT the output tensor):
- OUT.num_samples() == IN.num_samples()
- OUT.k() == IN.k()
- OUT.nr() == IN.nc()
- OUT.nc() == IN.nr()
The transposition is performed as follows:
- For each sample i and each k-plane j:
- OUT[i][j][r][c] = IN[i][j][c][r] for all r in [0, IN.nc()) and c in [0, IN.nr())
This layer does not have any learnable parameters.
!*/

public:

transpose_() = default;

template <typename SUBNET> void setup (const SUBNET& sub);
template <typename SUBNET> void forward(const SUBNET& sub, resizable_tensor& output);
template <typename SUBNET> void backward(const tensor& gradient_input, SUBNET& sub, tensor& params_grad);

inline dpoint map_input_to_output(dpoint p) const;
inline dpoint map_output_to_input(dpoint p) const;

const tensor& get_layer_params() const;
tensor& get_layer_params();

friend void serialize(const transpose_& item, std::ostream& out);
friend void deserialize(transpose_& item, std::istream& in);

friend std::ostream& operator<<(std::ostream& out, const transpose_& item);
friend void to_xml(const transpose_& item, std::ostream& out);

/*!
These functions are implemented as described in the EXAMPLE_COMPUTATIONAL_LAYER_ interface.
!*/
private:
resizable_tensor params; // unused
};

template <typename SUBNET>
using transpose = add_layer<transpose_, SUBNET>;
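And since transpose is an ordinary computational layer, it slots into a network type like any other; a toy sketch (the surrounding layers and widths are illustrative assumptions, not part of this commit):

#include <dlib/dnn.h>

using namespace dlib;

// Transpose each input plane before the fully connected layers.
using toy_net = loss_multiclass_log<
                fc<10, relu<
                fc<32, transpose<
                input<matrix<float>>
                >>>>>;

int main()
{
    toy_net net;
    matrix<float> m(4, 6);              // seen by fc<32> as a 6x4 plane
    m = 0;
    const unsigned long pred = net(m);  // untrained forward pass
    (void)pred;
    return 0;
}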

// ----------------------------------------------------------------------------------------

}
8 changes: 8 additions & 0 deletions dlib/dnn/visitors.h
@@ -1021,6 +1021,14 @@ namespace dlib
update(i);
}

template <typename U, typename E>
void operator()(size_t i, const add_layer<transpose_, U, E>&)
{
start_node(i, "transpose");
end_node();
update(i);
}

template <typename T, typename U, typename E>
void operator()(size_t i, const add_layer<T, U, E>&)
{
38 changes: 38 additions & 0 deletions dlib/test/dnn.cpp
@@ -748,6 +748,37 @@ namespace
#endif
}

// ----------------------------------------------------------------------------------------

void test_transpose()
{
const long num_samples = 2;
const long k = 3;
const long nr = 4;
const long nc = 5;

resizable_tensor input(num_samples, k, nr, nc);
resizable_tensor output_cpu_a(num_samples, k, nc, nr);
tt::tensor_rand rnd(0);
rnd.fill_uniform(input);
resizable_tensor output_cpu_b(input);

// Transposing twice with add_to == true accumulates onto the copy of
// input held in output_cpu_b, so it should end up equal to 2*input.
cpu::transpose(false, output_cpu_a, input);
cpu::transpose(true, output_cpu_b, output_cpu_a);
input *= 2;
DLIB_TEST(max(abs(mat(output_cpu_b) - mat(input))) < 1e-5);

#ifdef DLIB_USE_CUDA
// Repeat on the GPU and check the results match the CPU reference.
input /= 2;
resizable_tensor output_cuda_a, output_cuda_b(input);
output_cuda_a.copy_size(output_cpu_a);
cuda::transpose(false, output_cuda_a, input);
cuda::transpose(true, output_cuda_b, output_cuda_a);
DLIB_TEST(max(abs(mat(output_cpu_a) - mat(output_cuda_a))) < 1e-5);
DLIB_TEST(max(abs(mat(output_cpu_b) - mat(output_cuda_b))) < 1e-5);
#endif
}

// ----------------------------------------------------------------------------------------

void test_basic_tensor_ops()
@@ -2280,6 +2311,12 @@
auto res = test_layer(l);
DLIB_TEST_MSG(res, res);
}
{
print_spinner();
transpose_ l;
auto res = test_layer(l);
DLIB_TEST_MSG(res, res);
}
}

// ----------------------------------------------------------------------------------------
@@ -4489,6 +4526,7 @@ namespace
test_batch_normalize_conv();
test_layer_normalize();
test_rms_normalize();
test_transpose();
test_basic_tensor_ops();
test_layers();
test_visit_functions();