From 7804c5c437b1e5f4736f4b01d624873d04940983 Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Mon, 24 Aug 2020 18:23:57 -0700 Subject: [PATCH 01/12] Checkpoint --- benchmarks/utils.py | 3 + nestedtensor/csrc/BinaryOps.cpp | 2 +- nestedtensor/csrc/autograd_functions.cpp | 18 ++++++ nestedtensor/csrc/matmul.cpp | 80 ++++++++++++------------ nestedtensor/csrc/mha.cpp | 64 +++++++++++++++++++ nestedtensor/csrc/mha.h | 0 nestedtensor/csrc/py_init.cpp | 1 + nestedtensor/csrc/utils/nested_node.h | 1 + nestedtensor/nn/mha.py | 18 +++++- nestedtensor/version.py | 4 +- 10 files changed, 147 insertions(+), 44 deletions(-) create mode 100644 nestedtensor/csrc/mha.cpp create mode 100644 nestedtensor/csrc/mha.h diff --git a/benchmarks/utils.py b/benchmarks/utils.py index d1268602..276f1ce3 100644 --- a/benchmarks/utils.py +++ b/benchmarks/utils.py @@ -28,6 +28,9 @@ def benchmark_fn(fn, run_time = 5.0, use_cprofile=False, warmup=1.0, cuda=False) if use_cprofile: pr.enable() fn() + # if t > 1: + # import sys; sys.exit(1) + # import sys; sys.exit(1) if cuda: torch.cuda.synchronize() if use_cprofile: diff --git a/nestedtensor/csrc/BinaryOps.cpp b/nestedtensor/csrc/BinaryOps.cpp index 4b55fa47..c91a0587 100644 --- a/nestedtensor/csrc/BinaryOps.cpp +++ b/nestedtensor/csrc/BinaryOps.cpp @@ -43,7 +43,7 @@ Tensor NestedTensor_binary(const Tensor& self, const Tensor& other) { return map_nested_tensor( [&self](Tensor other) { return func(self, other); }, other); } - if (is_packed(self) && other.dim() == 0) { + if (is_packed(self) && (other.dim() == 0 || (other.dim() == 1 && other.numel() == 1))) { #ifdef TRACEPACKED std::cout << "calling packed binary " << typeid(func).name() << std::endl; #endif diff --git a/nestedtensor/csrc/autograd_functions.cpp b/nestedtensor/csrc/autograd_functions.cpp index 5b83663b..fdf5d8f5 100644 --- a/nestedtensor/csrc/autograd_functions.cpp +++ b/nestedtensor/csrc/autograd_functions.cpp @@ -206,6 +206,11 @@ Tensor NestedTensor_threshold_backward( } Tensor NestedTensor_dropout(const Tensor& input, double p, bool train) { + if (is_packed(input)) { + return wrap_tensor_node(torch::nested_tensor::impl::build_structure( + at::dropout(*get_nested_tensor_structure(input).buffer(), p, train), + get_nested_tensor_impl(input)->nested_size())); + } return autograd_map_nested_tensor( [&](const at::Tensor t) { return at::dropout(t, p, train); }, input); } @@ -284,6 +289,19 @@ Tensor NestedTensor_add(const Tensor& self, const Tensor& other, Scalar alpha) { return map_nested_tensor( [&](at::Tensor o) { return at::add(self, o, alpha); }, other); } + if (is_packed(self) && self.dim() == 3 && other.dim() == 1) { +#ifdef TRACEPACKED + std::cout << "calling packed add" << std::endl; +#endif + auto self_structure = get_nested_tensor_structure(self); + auto self_impl = get_nested_tensor_impl(self); + return wrap_tensor_node(torch::nested_tensor::impl::build_structure( + (*self_structure.buffer()) + .reshape({-1, other.size(0)}) + .add(other) + .reshape({-1}), + self_impl->nested_size())); + } return map_nested_tensor( [&](at::Tensor s) { return at::add(s, other, alpha); }, self); } diff --git a/nestedtensor/csrc/matmul.cpp b/nestedtensor/csrc/matmul.cpp index 72ef3b5c..8a23a215 100644 --- a/nestedtensor/csrc/matmul.cpp +++ b/nestedtensor/csrc/matmul.cpp @@ -11,46 +11,46 @@ Tensor NestedTensor_matmul(const Tensor& self, const Tensor& other) { AutoGradMode autogradmode(false); auto impl_self = get_nested_tensor_impl(self); auto structure_self = get_nested_tensor_structure(self); - if 
(is_nested_tensor_impl(other)) { - auto impl_other = get_nested_tensor_impl(other); - auto structure_other = get_nested_tensor_structure(other); - if (structure_self.buffer() && structure_other.buffer() && - self.dim() == 4 && other.dim() == 4 && impl_self->opt_sizes()[0] && - impl_other->opt_sizes()[0] && impl_self->opt_sizes()[1] && - impl_other->opt_sizes()[1] && impl_self->opt_sizes()[3] && - impl_other->opt_sizes()[2] && - (*impl_self->opt_sizes()[0] == *impl_other->opt_sizes()[0]) && - (*impl_self->opt_sizes()[1] == *impl_other->opt_sizes()[1]) && - (*impl_self->opt_sizes()[3] == *impl_other->opt_sizes()[2])) { -#ifdef TRACEPACKED - std::cout << "calling packed NT x NT matmul" << std::endl; -#endif - SizeNode new_nested_size = map( - [&](c10::List self_size, c10::List other_size) { - c10::List new_size{ - self_size[0], self_size[1], other_size[2]}; - return std::move(new_size); - }, - impl_self->nested_size(), - impl_other->nested_size()); - auto fn = [](c10::List leaf, int64_t input) { - return input + leaf[0] * leaf[1] * leaf[2]; - }; - int64_t new_numel = reduce>( - new_nested_size, fn, 0); - Tensor new_buffer = at::empty({new_numel}, self.options()); - Tensor result = - wrap_tensor_node(torch::nested_tensor::impl::build_structure( - std::move(new_buffer), new_nested_size)); - apply_nested_tensor( - [](at::Tensor& result, - at::Tensor self, - at::Tensor other) { at::matmul_out(result, self, other); }, - result, - self, - other); - return result; - } + if (is_nested_tensor_impl(other)) { +// auto impl_other = get_nested_tensor_impl(other); +// auto structure_other = get_nested_tensor_structure(other); +// if (structure_self.buffer() && structure_other.buffer() && +// self.dim() == 4 && other.dim() == 4 && impl_self->opt_sizes()[0] && +// impl_other->opt_sizes()[0] && impl_self->opt_sizes()[1] && +// impl_other->opt_sizes()[1] && impl_self->opt_sizes()[3] && +// impl_other->opt_sizes()[2] && +// (*impl_self->opt_sizes()[0] == *impl_other->opt_sizes()[0]) && +// (*impl_self->opt_sizes()[1] == *impl_other->opt_sizes()[1]) && +// (*impl_self->opt_sizes()[3] == *impl_other->opt_sizes()[2])) { +// #ifdef TRACEPACKED +// std::cout << "calling packed NT x NT matmul" << std::endl; +// #endif +// SizeNode new_nested_size = map( +// [&](c10::List self_size, c10::List other_size) { +// c10::List new_size{ +// self_size[0], self_size[1], other_size[2]}; +// return std::move(new_size); +// }, +// impl_self->nested_size(), +// impl_other->nested_size()); +// auto fn = [](c10::List leaf, int64_t input) { +// return input + leaf[0] * leaf[1] * leaf[2]; +// }; +// int64_t new_numel = reduce>( +// new_nested_size, fn, 0); +// // Tensor new_buffer = at::empty({new_numel}, self.options()); +// // Tensor result = +// // wrap_tensor_node(torch::nested_tensor::impl::build_structure( +// // std::move(new_buffer), new_nested_size)); +// return map_nested_tensor( +// [](//at::Tensor& result, +// at::Tensor self, +// at::Tensor other) { at::matmul(self, other); }, +// // result, +// self, +// other); +// // return result; +// } return map_nested_tensor( [](Tensor s, Tensor o) { return at::matmul(s, o); }, self, diff --git a/nestedtensor/csrc/mha.cpp b/nestedtensor/csrc/mha.cpp new file mode 100644 index 00000000..8441e49f --- /dev/null +++ b/nestedtensor/csrc/mha.cpp @@ -0,0 +1,64 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +namespace py = pybind11; + +using namespace torch::nested_tensor; +using namespace at; + +namespace torch { +namespace nested_tensor { + 
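+// min_mha is a minimal fused multi-head attention forward. Per head it
+// computes softmax((q * scaling) k^T) v, where q, k and v come from the
+// row blocks [0, E), [E, 2E) and [2E, 3E) of in_proj_weight (and of
+// in_proj_bias, if present) applied to query, key and value, with
+// E = embed_dim = num_heads * head_dim, followed by the
+// out_proj_weight / out_proj_bias output projection. Attention masks
+// and need_weights are not handled here.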
+at::Tensor min_mha( + int64_t num_heads, + int64_t head_dim, + double dropout_p, + bool training, + at::Tensor query, + at::Tensor key, + at::Tensor value, + at::Tensor in_proj_weight, + c10::optional in_proj_bias, + double scaling, + at::Tensor out_proj_weight, + at::Tensor out_proj_bias) { + TORCH_CHECK(query.dim() == 3, "query needs to be 3 dim."); + TORCH_CHECK(key.dim() == 3, "key needs to be 3 dim."); + TORCH_CHECK(value.dim() == 3, "value needs to be 3 dim."); + int64_t edim = query.size(2); + + auto q = at::matmul(query, at::slice(in_proj_weight, 0, 0, edim)); + auto k = at::matmul(key, at::slice(in_proj_weight, 0, edim, 2 * edim)); + auto v = at::matmul(value, at::slice(in_proj_weight, 0, 2 * edim)); + if (in_proj_bias) { + q = q + at::slice(*in_proj_bias, 0, 0, edim); + k = k + at::slice(*in_proj_bias, 0, edim, 2 * edim); + v = v + at::slice(*in_proj_bias, 0, 2 * edim); + } + + q = at::mul(q, torch::tensor({scaling}, q.options())); + + q = q.reshape({-1, -1, num_heads, head_dim}).transpose(1, 2); + k = k.reshape({-1, -1, num_heads, head_dim}).transpose(1, 2); + v = v.reshape({-1, -1, num_heads, head_dim}).transpose(1, 2); + auto attn_output_weights = at::matmul(q, k.transpose(2, 3)); + attn_output_weights = at::softmax(attn_output_weights, -1).contiguous(); + attn_output_weights = at::dropout(attn_output_weights, dropout_p, training); + auto attn_output = at::matmul(attn_output_weights, v); + attn_output = attn_output.transpose(1, 2).reshape({-1, -1, edim}).contiguous(); + attn_output = at::matmul(attn_output, out_proj_weight); + attn_output += out_proj_bias; + return attn_output; +} + +static auto registry = + torch::RegisterOperators().op("nestedtensor::min_mha", &min_mha); + +} // namespace nested_tensor +} // namespace torch diff --git a/nestedtensor/csrc/mha.h b/nestedtensor/csrc/mha.h new file mode 100644 index 00000000..e69de29b diff --git a/nestedtensor/csrc/py_init.cpp b/nestedtensor/csrc/py_init.cpp index d2a1579d..c696738b 100644 --- a/nestedtensor/csrc/py_init.cpp +++ b/nestedtensor/csrc/py_init.cpp @@ -7,6 +7,7 @@ #include #include #include +#include // NOTE: A NestedTensor without any constituents, i.e. // nested_tensor([]) is of dimension 1 because diff --git a/nestedtensor/csrc/utils/nested_node.h b/nestedtensor/csrc/utils/nested_node.h index 7a791043..6f2b9f4b 100644 --- a/nestedtensor/csrc/utils/nested_node.h +++ b/nestedtensor/csrc/utils/nested_node.h @@ -402,6 +402,7 @@ inline TensorNode build_structure( inline TensorNode build_structure( at::Tensor&& buffer, const SizeNode& nested_size) { + TORCH_CHECK(buffer.dim() == 1, "Given buffer must be vector, i.e. 
dim 1 Tensor."); SizeNode nested_stride = map( [](c10::List size) { return _cont_stride(size); }, nested_size); return build_structure(std::move(buffer), nested_size, nested_stride); diff --git a/nestedtensor/nn/mha.py b/nestedtensor/nn/mha.py index d076a28b..ebe89f1b 100644 --- a/nestedtensor/nn/mha.py +++ b/nestedtensor/nn/mha.py @@ -69,7 +69,23 @@ def multi_head_attention_forward(query, # type: Nested head_dim = embed_dim // num_heads assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" scaling = float(head_dim) ** -0.5 - + # print(query.nested_size()) + # print(key.nested_size()) + # print(value.nested_size()) + # print(in_proj_bias.size()) + + return torch.ops.nestedtensor.min_mha(num_heads, + head_dim, + dropout_p, + training, + query._impl, + key._impl, + value._impl, + in_proj_weight, + in_proj_bias, + scaling, + out_proj_weight, + out_proj_bias), None # This is inline in_proj function with in_proj_weight and in_proj_bias _b = in_proj_bias diff --git a/nestedtensor/version.py b/nestedtensor/version.py index 5c8ced47..b17166b5 100644 --- a/nestedtensor/version.py +++ b/nestedtensor/version.py @@ -1,5 +1,5 @@ -__version__ = '0.0.1.dev202082421+93f26bc' -git_version = '93f26bcc6cd53fbf7644909e148efcdf2f6f49a3' +__version__ = '0.0.1.dev20208251+17bf81d' +git_version = '17bf81da887e618d3931b229795b6320b26b3f78' from nestedtensor import _C if hasattr(_C, 'CUDA_VERSION'): cuda = _C.CUDA_VERSION From eec9dfdf38998aa76b6182a60bdef4df6eb7796f Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Mon, 24 Aug 2020 19:28:59 -0700 Subject: [PATCH 02/12] Checkpoint --- benchmarks/mha.py | 3 +- benchmarks/utils.py | 2 +- nestedtensor/csrc/matmul.cpp | 109 +++++++++++++++++++++-------------- nestedtensor/csrc/mha.cpp | 1 + nestedtensor/nn/mha.py | 8 +-- nestedtensor/version.py | 4 +- 6 files changed, 76 insertions(+), 51 deletions(-) diff --git a/benchmarks/mha.py b/benchmarks/mha.py index d0f3ba58..9d117260 100644 --- a/benchmarks/mha.py +++ b/benchmarks/mha.py @@ -6,8 +6,9 @@ import random # Performance tanks hard for lots of small Tensors as expected +random.seed(1010) RAND_INTS = [random.randint(10, 30) for _ in range(2000)] -RAND_INTS = [random.randint(100, 300) for _ in range(20)] +RAND_INTS = [random.randint(100, 300) for _ in range(2)] # (26, None, 256) (26, None, 256) (26, None, 256) torch.Size([256, 256]) torch.Size([256]) MODEL0 = torch.nn.MultiheadAttention(256, 8, dropout=0.1).cuda() diff --git a/benchmarks/utils.py b/benchmarks/utils.py index 276f1ce3..0d1991fd 100644 --- a/benchmarks/utils.py +++ b/benchmarks/utils.py @@ -30,7 +30,7 @@ def benchmark_fn(fn, run_time = 5.0, use_cprofile=False, warmup=1.0, cuda=False) fn() # if t > 1: # import sys; sys.exit(1) - # import sys; sys.exit(1) + import sys; sys.exit(1) if cuda: torch.cuda.synchronize() if use_cprofile: diff --git a/nestedtensor/csrc/matmul.cpp b/nestedtensor/csrc/matmul.cpp index 8a23a215..33c0d132 100644 --- a/nestedtensor/csrc/matmul.cpp +++ b/nestedtensor/csrc/matmul.cpp @@ -11,50 +11,73 @@ Tensor NestedTensor_matmul(const Tensor& self, const Tensor& other) { AutoGradMode autogradmode(false); auto impl_self = get_nested_tensor_impl(self); auto structure_self = get_nested_tensor_structure(self); - if (is_nested_tensor_impl(other)) { -// auto impl_other = get_nested_tensor_impl(other); -// auto structure_other = get_nested_tensor_structure(other); -// if (structure_self.buffer() && structure_other.buffer() && -// self.dim() == 4 && other.dim() == 4 && impl_self->opt_sizes()[0] && -// 
impl_other->opt_sizes()[0] && impl_self->opt_sizes()[1] && -// impl_other->opt_sizes()[1] && impl_self->opt_sizes()[3] && -// impl_other->opt_sizes()[2] && -// (*impl_self->opt_sizes()[0] == *impl_other->opt_sizes()[0]) && -// (*impl_self->opt_sizes()[1] == *impl_other->opt_sizes()[1]) && -// (*impl_self->opt_sizes()[3] == *impl_other->opt_sizes()[2])) { -// #ifdef TRACEPACKED -// std::cout << "calling packed NT x NT matmul" << std::endl; -// #endif -// SizeNode new_nested_size = map( -// [&](c10::List self_size, c10::List other_size) { -// c10::List new_size{ -// self_size[0], self_size[1], other_size[2]}; -// return std::move(new_size); -// }, -// impl_self->nested_size(), -// impl_other->nested_size()); -// auto fn = [](c10::List leaf, int64_t input) { -// return input + leaf[0] * leaf[1] * leaf[2]; -// }; -// int64_t new_numel = reduce>( -// new_nested_size, fn, 0); -// // Tensor new_buffer = at::empty({new_numel}, self.options()); -// // Tensor result = -// // wrap_tensor_node(torch::nested_tensor::impl::build_structure( -// // std::move(new_buffer), new_nested_size)); -// return map_nested_tensor( -// [](//at::Tensor& result, -// at::Tensor self, -// at::Tensor other) { at::matmul(self, other); }, -// // result, -// self, -// other); -// // return result; -// } + if (is_nested_tensor_impl(other)) { + auto impl_other = get_nested_tensor_impl(other); + auto structure_other = get_nested_tensor_structure(other); + if (structure_self.buffer() && structure_other.buffer() && + self.dim() == 4 && other.dim() == 4 && impl_self->opt_sizes()[0] && + impl_other->opt_sizes()[0] && impl_self->opt_sizes()[1] && + impl_other->opt_sizes()[1] && impl_self->opt_sizes()[3] && + impl_other->opt_sizes()[2] && + (*impl_self->opt_sizes()[0] == *impl_other->opt_sizes()[0]) && + (*impl_self->opt_sizes()[1] == *impl_other->opt_sizes()[1]) && + (*impl_self->opt_sizes()[3] == *impl_other->opt_sizes()[2])) { +#ifdef TRACEPACKED + std::cout << "calling packed NT x NT matmul" << std::endl; +#endif + SizeNode new_nested_size = map( + [&](c10::List self_size, c10::List other_size) { + std::cout << "self_size: " << self_size[0] << " " << self_size[1] + << " " << self_size[2] << std::endl; + std::cout << "other_size: " << other_size[0] << " " << other_size[1] + << " " << other_size[2] << std::endl; + c10::List new_size{ + self_size[0], self_size[1], other_size[2]}; + std::cout << "new_size: " << new_size[0] << " " << new_size[1] + << " " << new_size[2] << std::endl; + return std::move(new_size); + }, + impl_self->nested_size(), + impl_other->nested_size()); + auto self_buffer = *structure_self.buffer(); + auto other_buffer = *structure_other.buffer(); + self_buffer = self_buffer.reshape({self.size(1), -1, self.size(3)}); + other_buffer = other_buffer.reshape({self.size(1), other.size(2), -1}); + std::cout << self_buffer.sizes() << std::endl; + std::cout << other_buffer.sizes() << std::endl; + auto result_buffer = at::bmm(self_buffer, other_buffer); + std::cout << result_buffer.sizes() << std::endl; + result_buffer = result_buffer.reshape({-1}); + auto rr = map_nested_tensor( + [](Tensor s, Tensor o) { return at::matmul(s, o); }, self, other); + std::cout << "rr.numel(): " << rr.numel() << std::endl; + apply_nested_tensor( + [](at::Tensor a) { + std::cout << "a.sizes(): " << a.sizes() << std::endl; + }, + rr); + return wrap_tensor_node(torch::nested_tensor::impl::build_structure( + std::move(result_buffer), new_nested_size)); + // auto fn = [](c10::List leaf, int64_t input) { + // return input + leaf[0] * leaf[1] * 
leaf[2]; + // }; + // int64_t new_numel = reduce>( + // new_nested_size, fn, 0); + // Tensor new_buffer = at::empty({new_numel}, self.options()); + // Tensor result = + // wrap_tensor_node(torch::nested_tensor::impl::build_structure( + // std::move(new_buffer), new_nested_size)); + // return map_nested_tensor( + // [](//at::Tensor& result, + // at::Tensor self, + // at::Tensor other) { at::matmul(self, other); }, + // // result, + // self, + // other); + // // return result; + } return map_nested_tensor( - [](Tensor s, Tensor o) { return at::matmul(s, o); }, - self, - other); + [](Tensor s, Tensor o) { return at::matmul(s, o); }, self, other); } if (structure_self.buffer()) { if (self.dim() == 3 && other.dim() == 2 && impl_self->opt_sizes()[0] && diff --git a/nestedtensor/csrc/mha.cpp b/nestedtensor/csrc/mha.cpp index 8441e49f..b46fdc5d 100644 --- a/nestedtensor/csrc/mha.cpp +++ b/nestedtensor/csrc/mha.cpp @@ -33,6 +33,7 @@ at::Tensor min_mha( TORCH_CHECK(value.dim() == 3, "value needs to be 3 dim."); int64_t edim = query.size(2); + //TODO: Use addmm! auto q = at::matmul(query, at::slice(in_proj_weight, 0, 0, edim)); auto k = at::matmul(key, at::slice(in_proj_weight, 0, edim, 2 * edim)); auto v = at::matmul(value, at::slice(in_proj_weight, 0, 2 * edim)); diff --git a/nestedtensor/nn/mha.py b/nestedtensor/nn/mha.py index ebe89f1b..64156b2c 100644 --- a/nestedtensor/nn/mha.py +++ b/nestedtensor/nn/mha.py @@ -69,10 +69,10 @@ def multi_head_attention_forward(query, # type: Nested head_dim = embed_dim // num_heads assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" scaling = float(head_dim) ** -0.5 - # print(query.nested_size()) - # print(key.nested_size()) - # print(value.nested_size()) - # print(in_proj_bias.size()) + print(query.nested_size()) + print(key.nested_size()) + print(value.nested_size()) + print(in_proj_bias.size()) return torch.ops.nestedtensor.min_mha(num_heads, head_dim, diff --git a/nestedtensor/version.py b/nestedtensor/version.py index b17166b5..517ca509 100644 --- a/nestedtensor/version.py +++ b/nestedtensor/version.py @@ -1,5 +1,5 @@ -__version__ = '0.0.1.dev20208251+17bf81d' -git_version = '17bf81da887e618d3931b229795b6320b26b3f78' +__version__ = '0.0.1.dev20208252+7804c5c' +git_version = '7804c5c437b1e5f4736f4b01d624873d04940983' from nestedtensor import _C if hasattr(_C, 'CUDA_VERSION'): cuda = _C.CUDA_VERSION From cae003de0c518bcb7f24f58f3c80e20e49a41f6a Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Mon, 24 Aug 2020 20:55:58 -0700 Subject: [PATCH 03/12] Checkpoint --- benchmarks/mha.py | 4 +- benchmarks/utils.py | 2 +- nestedtensor/csrc/matmul.cpp | 153 ++++++++++-------- nestedtensor/csrc/mha.cpp | 42 +++-- nestedtensor/nn/mha.py | 14 +- nestedtensor/version.py | 4 +- .../test_nested_tensor_autograd_functional.py | 3 + 7 files changed, 135 insertions(+), 87 deletions(-) diff --git a/benchmarks/mha.py b/benchmarks/mha.py index 9d117260..d07dc516 100644 --- a/benchmarks/mha.py +++ b/benchmarks/mha.py @@ -8,7 +8,7 @@ # Performance tanks hard for lots of small Tensors as expected random.seed(1010) RAND_INTS = [random.randint(10, 30) for _ in range(2000)] -RAND_INTS = [random.randint(100, 300) for _ in range(2)] +RAND_INTS = [random.randint(100, 300) for _ in range(20)] # (26, None, 256) (26, None, 256) (26, None, 256) torch.Size([256, 256]) torch.Size([256]) MODEL0 = torch.nn.MultiheadAttention(256, 8, dropout=0.1).cuda() @@ -34,4 +34,4 @@ def nt(): if __name__ == "__main__": print(utils.benchmark_fn(gen_nt_segmentation())) - # 
print(utils.benchmark_fn(gen_t_loop_segmentation())) + print(utils.benchmark_fn(gen_t_loop_segmentation())) diff --git a/benchmarks/utils.py b/benchmarks/utils.py index 0d1991fd..276f1ce3 100644 --- a/benchmarks/utils.py +++ b/benchmarks/utils.py @@ -30,7 +30,7 @@ def benchmark_fn(fn, run_time = 5.0, use_cprofile=False, warmup=1.0, cuda=False) fn() # if t > 1: # import sys; sys.exit(1) - import sys; sys.exit(1) + # import sys; sys.exit(1) if cuda: torch.cuda.synchronize() if use_cprofile: diff --git a/nestedtensor/csrc/matmul.cpp b/nestedtensor/csrc/matmul.cpp index 33c0d132..902bec3c 100644 --- a/nestedtensor/csrc/matmul.cpp +++ b/nestedtensor/csrc/matmul.cpp @@ -14,77 +14,90 @@ Tensor NestedTensor_matmul(const Tensor& self, const Tensor& other) { if (is_nested_tensor_impl(other)) { auto impl_other = get_nested_tensor_impl(other); auto structure_other = get_nested_tensor_structure(other); - if (structure_self.buffer() && structure_other.buffer() && - self.dim() == 4 && other.dim() == 4 && impl_self->opt_sizes()[0] && - impl_other->opt_sizes()[0] && impl_self->opt_sizes()[1] && - impl_other->opt_sizes()[1] && impl_self->opt_sizes()[3] && - impl_other->opt_sizes()[2] && - (*impl_self->opt_sizes()[0] == *impl_other->opt_sizes()[0]) && - (*impl_self->opt_sizes()[1] == *impl_other->opt_sizes()[1]) && - (*impl_self->opt_sizes()[3] == *impl_other->opt_sizes()[2])) { -#ifdef TRACEPACKED - std::cout << "calling packed NT x NT matmul" << std::endl; -#endif - SizeNode new_nested_size = map( - [&](c10::List self_size, c10::List other_size) { - std::cout << "self_size: " << self_size[0] << " " << self_size[1] - << " " << self_size[2] << std::endl; - std::cout << "other_size: " << other_size[0] << " " << other_size[1] - << " " << other_size[2] << std::endl; - c10::List new_size{ - self_size[0], self_size[1], other_size[2]}; - std::cout << "new_size: " << new_size[0] << " " << new_size[1] - << " " << new_size[2] << std::endl; - return std::move(new_size); - }, - impl_self->nested_size(), - impl_other->nested_size()); - auto self_buffer = *structure_self.buffer(); - auto other_buffer = *structure_other.buffer(); - self_buffer = self_buffer.reshape({self.size(1), -1, self.size(3)}); - other_buffer = other_buffer.reshape({self.size(1), other.size(2), -1}); - std::cout << self_buffer.sizes() << std::endl; - std::cout << other_buffer.sizes() << std::endl; - auto result_buffer = at::bmm(self_buffer, other_buffer); - std::cout << result_buffer.sizes() << std::endl; - result_buffer = result_buffer.reshape({-1}); - auto rr = map_nested_tensor( - [](Tensor s, Tensor o) { return at::matmul(s, o); }, self, other); - std::cout << "rr.numel(): " << rr.numel() << std::endl; - apply_nested_tensor( - [](at::Tensor a) { - std::cout << "a.sizes(): " << a.sizes() << std::endl; - }, - rr); - return wrap_tensor_node(torch::nested_tensor::impl::build_structure( - std::move(result_buffer), new_nested_size)); - // auto fn = [](c10::List leaf, int64_t input) { - // return input + leaf[0] * leaf[1] * leaf[2]; - // }; - // int64_t new_numel = reduce>( - // new_nested_size, fn, 0); - // Tensor new_buffer = at::empty({new_numel}, self.options()); - // Tensor result = - // wrap_tensor_node(torch::nested_tensor::impl::build_structure( - // std::move(new_buffer), new_nested_size)); - // return map_nested_tensor( - // [](//at::Tensor& result, - // at::Tensor self, - // at::Tensor other) { at::matmul(self, other); }, - // // result, - // self, - // other); - // // return result; - } +// if (structure_self.buffer() && 
structure_other.buffer() && +// self.dim() == 4 && other.dim() == 4 && impl_self->opt_sizes()[0] && +// impl_other->opt_sizes()[0] && impl_self->opt_sizes()[1] && +// impl_other->opt_sizes()[1] && impl_self->opt_sizes()[3] && +// impl_other->opt_sizes()[2] && +// (*impl_self->opt_sizes()[0] == *impl_other->opt_sizes()[0]) && +// (*impl_self->opt_sizes()[1] == *impl_other->opt_sizes()[1]) && +// (*impl_self->opt_sizes()[3] == *impl_other->opt_sizes()[2])) { +//#ifdef TRACEPACKED +// std::cout << "calling packed NT x NT matmul" << std::endl; +//#endif +// SizeNode new_nested_size = map( +// [&](c10::List self_size, c10::List other_size) { +// c10::List new_size{ +// self_size[0], self_size[1], other_size[2]}; +// return std::move(new_size); +// }, +// impl_self->nested_size(), +// impl_other->nested_size()); +// auto fn = [](c10::List leaf, int64_t input) { +// return input + leaf[0] * leaf[1] * leaf[2]; +// }; +// int64_t new_numel = reduce>( +// new_nested_size, fn, 0); +// Tensor new_buffer = at::empty({new_numel}, self.options()); +// Tensor result = +// wrap_tensor_node(torch::nested_tensor::impl::build_structure( +// std::move(new_buffer), new_nested_size)); +// apply_nested_tensor( +// [](at::Tensor& result, at::Tensor self, at::Tensor other) { +// at::matmul_out(result, self, other); +// }, +// result, +// self, +// other); +// return result; +// } return map_nested_tensor( [](Tensor s, Tensor o) { return at::matmul(s, o); }, self, other); } +// if (structure_self.buffer()) { +// if (self.dim() == 3 && other.dim() == 2 && impl_self->opt_sizes()[0] && +// impl_self->opt_sizes()[2] && +// impl_self->opt_sizes()[self.dim() - 1] == other.size(self.dim() - 2)) { +//#ifdef TRACEPACKED +// std::cout << "calling packed NT x T matmul" << std::endl; +//#endif +// SizeNode new_nested_size = map( +// [&](c10::List self_size) { +// c10::List new_size{self_size[0], other.size(1)}; +// return std::move(new_size); +// }, +// impl_self->nested_size()); +// return wrap_tensor_node(torch::nested_tensor::impl::build_structure( +// at::matmul( +// (*structure_self.buffer()).reshape({-1, other.size(0)}), other) +// .reshape(-1), +// new_nested_size)); +// } +// } + return map_nested_tensor( + [&other](Tensor tensor) { return at::matmul(tensor, other); }, self); +} + +Tensor NestedTensor_addmm( + const Tensor& input, + const Tensor& self, + const Tensor& other, + c10::Scalar alpha, + c10::Scalar beta) { + AutoGradMode autogradmode(false); + TORCH_CHECK(!is_nested_tensor_impl(input), "input must be Tensor"); + TORCH_CHECK(!is_nested_tensor_impl(other), "other must be Tensor"); + TORCH_CHECK(is_nested_tensor_impl(self), "self must be NestedTensor"); + // TORCH_CHECK(alpha == 1, "alpha must be 1."); + // TORCH_CHECK(beta == 1, "beta must be 1."); + auto impl_self = get_nested_tensor_impl(self); + auto structure_self = get_nested_tensor_structure(self); if (structure_self.buffer()) { if (self.dim() == 3 && other.dim() == 2 && impl_self->opt_sizes()[0] && impl_self->opt_sizes()[2] && impl_self->opt_sizes()[self.dim() - 1] == other.size(self.dim() - 2)) { #ifdef TRACEPACKED - std::cout << "calling packed NT x T matmul" << std::endl; + std::cout << "calling packed T x NT x T addmm" << std::endl; #endif SizeNode new_nested_size = map( [&](c10::List self_size) { @@ -93,14 +106,21 @@ Tensor NestedTensor_matmul(const Tensor& self, const Tensor& other) { }, impl_self->nested_size()); return wrap_tensor_node(torch::nested_tensor::impl::build_structure( - at::matmul( - (*structure_self.buffer()).reshape({-1, 
other.size(0)}), other) + at::addmm( + input, + (*structure_self.buffer()).reshape({-1, other.size(0)}), + other, + alpha, + beta) .reshape(-1), new_nested_size)); } } return map_nested_tensor( - [&other](Tensor tensor) { return at::matmul(tensor, other); }, self); + [&](Tensor tensor) { + return at::addmm(input, tensor, other, alpha, beta); + }, + self); } Tensor& NestedTensor_matmul_out( @@ -120,6 +140,7 @@ Tensor& NestedTensor_matmul_out( TORCH_LIBRARY_IMPL(aten, PrivateUse1_PreAutograd, m) { // nt_impl(m, "matmul", no_bw(TORCH_FN(NestedTensor_matmul); + nt_impl(m, "addmm", NestedTensor_addmm); nt_impl(m, "matmul", NestedTensor_matmul); nt_impl(m, "matmul.out", NestedTensor_matmul_out); } diff --git a/nestedtensor/csrc/mha.cpp b/nestedtensor/csrc/mha.cpp index b46fdc5d..3083fa11 100644 --- a/nestedtensor/csrc/mha.cpp +++ b/nestedtensor/csrc/mha.cpp @@ -33,27 +33,45 @@ at::Tensor min_mha( TORCH_CHECK(value.dim() == 3, "value needs to be 3 dim."); int64_t edim = query.size(2); - //TODO: Use addmm! - auto q = at::matmul(query, at::slice(in_proj_weight, 0, 0, edim)); - auto k = at::matmul(key, at::slice(in_proj_weight, 0, edim, 2 * edim)); - auto v = at::matmul(value, at::slice(in_proj_weight, 0, 2 * edim)); + at::Tensor q, k, v; + // TODO: Use addmm! + // if input.dim() == 2 and bias is not None: + // # fused op is marginally faster + // ret = torch.addmm(bias, input, weight.t()) if (in_proj_bias) { - q = q + at::slice(*in_proj_bias, 0, 0, edim); - k = k + at::slice(*in_proj_bias, 0, edim, 2 * edim); - v = v + at::slice(*in_proj_bias, 0, 2 * edim); + q = at::addmm( + at::slice(*in_proj_bias, 0, 0, edim), + query, + at::slice(in_proj_weight, 0, 0, edim).t(), + scaling, + scaling); + k = at::addmm( + at::slice(*in_proj_bias, 0, edim, 2 * edim), + key, + at::slice(in_proj_weight, 0, edim, 2 * edim).t()); + v = at::addmm( + at::slice(*in_proj_bias, 0, 2 * edim), + value, + at::slice(in_proj_weight, 0, 2 * edim).t()); + // q = q + at::slice(*in_proj_bias, 0, 0, edim); + // k = k + at::slice(*in_proj_bias, 0, edim, 2 * edim); + // v = v + at::slice(*in_proj_bias, 0, 2 * edim); + } else { + q = at::matmul(query, at::slice(in_proj_weight, 0, 0, edim).t()); + k = at::matmul(key, at::slice(in_proj_weight, 0, edim, 2 * edim).t()); + v = at::matmul(value, at::slice(in_proj_weight, 0, 2 * edim).t()); + q = at::mul(q, torch::tensor({scaling}, q.options())); } - q = at::mul(q, torch::tensor({scaling}, q.options())); - q = q.reshape({-1, -1, num_heads, head_dim}).transpose(1, 2); k = k.reshape({-1, -1, num_heads, head_dim}).transpose(1, 2); v = v.reshape({-1, -1, num_heads, head_dim}).transpose(1, 2); auto attn_output_weights = at::matmul(q, k.transpose(2, 3)); - attn_output_weights = at::softmax(attn_output_weights, -1).contiguous(); + attn_output_weights = at::softmax(attn_output_weights, -1); attn_output_weights = at::dropout(attn_output_weights, dropout_p, training); auto attn_output = at::matmul(attn_output_weights, v); - attn_output = attn_output.transpose(1, 2).reshape({-1, -1, edim}).contiguous(); - attn_output = at::matmul(attn_output, out_proj_weight); + attn_output = attn_output.transpose(1, 2).reshape({-1, -1, edim}); + attn_output = at::matmul(attn_output, out_proj_weight.t()); attn_output += out_proj_bias; return attn_output; } diff --git a/nestedtensor/nn/mha.py b/nestedtensor/nn/mha.py index 64156b2c..a84000f9 100644 --- a/nestedtensor/nn/mha.py +++ b/nestedtensor/nn/mha.py @@ -47,6 +47,9 @@ def multi_head_attention_forward(query, # type: Nested assert isinstance(query, 
nestedtensor.NestedTensor) assert isinstance(key, nestedtensor.NestedTensor) assert isinstance(value, nestedtensor.NestedTensor) + query = query.contiguous() + key = key.contiguous() + value = value.contiguous() assert torch.is_tensor(out_proj_weight) assert torch.is_tensor(out_proj_bias) @@ -69,10 +72,10 @@ def multi_head_attention_forward(query, # type: Nested head_dim = embed_dim // num_heads assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" scaling = float(head_dim) ** -0.5 - print(query.nested_size()) - print(key.nested_size()) - print(value.nested_size()) - print(in_proj_bias.size()) + # print(query.nested_size()) + # print(key.nested_size()) + # print(value.nested_size()) + # print(in_proj_bias.size()) return torch.ops.nestedtensor.min_mha(num_heads, head_dim, @@ -92,9 +95,11 @@ def multi_head_attention_forward(query, # type: Nested _start = 0 _end = embed_dim _w = in_proj_weight[_start:_end, :] + print(_w.sum()) if _b is not None: _b = _b[_start:_end] q = F.linear(query, _w, _b) + print(q.sum()) # This is inline in_proj function with in_proj_weight and in_proj_bias _b = in_proj_bias @@ -114,6 +119,7 @@ def multi_head_attention_forward(query, # type: Nested _b = _b[_start:] v = F.linear(value, _w, _b) q = q * scaling + print(q.sum()) # NOTE: This is usually contiguous plus a view q = q.reshape(-1, -1, num_heads, head_dim).transpose(1, 2) diff --git a/nestedtensor/version.py b/nestedtensor/version.py index 517ca509..5dbadc67 100644 --- a/nestedtensor/version.py +++ b/nestedtensor/version.py @@ -1,5 +1,5 @@ -__version__ = '0.0.1.dev20208252+7804c5c' -git_version = '7804c5c437b1e5f4736f4b01d624873d04940983' +__version__ = '0.0.1.dev20208253+eec9dfd' +git_version = 'eec9dfdf38998aa76b6182a60bdef4df6eb7796f' from nestedtensor import _C if hasattr(_C, 'CUDA_VERSION'): cuda = _C.CUDA_VERSION diff --git a/test/test_nested_tensor_autograd_functional.py b/test/test_nested_tensor_autograd_functional.py index b41d5849..7376beef 100644 --- a/test/test_nested_tensor_autograd_functional.py +++ b/test/test_nested_tensor_autograd_functional.py @@ -224,6 +224,7 @@ def _test(FCNHead): def test_mha(self): embed_dim = 2 num_heads = 2 + torch.manual_seed(1010) mha = torch.nn.MultiheadAttention(embed_dim, num_heads) query = torch.randn(3, 1, embed_dim) key = torch.randn(2, 1, embed_dim) @@ -241,6 +242,8 @@ def test_mha(self): query_nt, key_nt, value_nt, need_weights=False) # nt_attn_output.sum().backward() # For regular tensors the batch dimension is along dimension 1 + print(attn_output.sum()) + print(nt_attn_output.sum()) self.assertEqual(attn_output.squeeze(1), nt_attn_output[0]) From ecbbf50c9021d97d2f83d3e6d704f1754b4557fd Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Mon, 24 Aug 2020 20:59:51 -0700 Subject: [PATCH 04/12] Checkpoint --- benchmarks/utils.py | 3 - nestedtensor/csrc/autograd_functions.cpp | 5 - nestedtensor/csrc/matmul.cpp | 150 ++++++++++++----------- 3 files changed, 76 insertions(+), 82 deletions(-) diff --git a/benchmarks/utils.py b/benchmarks/utils.py index 276f1ce3..d1268602 100644 --- a/benchmarks/utils.py +++ b/benchmarks/utils.py @@ -28,9 +28,6 @@ def benchmark_fn(fn, run_time = 5.0, use_cprofile=False, warmup=1.0, cuda=False) if use_cprofile: pr.enable() fn() - # if t > 1: - # import sys; sys.exit(1) - # import sys; sys.exit(1) if cuda: torch.cuda.synchronize() if use_cprofile: diff --git a/nestedtensor/csrc/autograd_functions.cpp b/nestedtensor/csrc/autograd_functions.cpp index fdf5d8f5..5018dd82 100644 --- 
a/nestedtensor/csrc/autograd_functions.cpp +++ b/nestedtensor/csrc/autograd_functions.cpp @@ -206,11 +206,6 @@ Tensor NestedTensor_threshold_backward( } Tensor NestedTensor_dropout(const Tensor& input, double p, bool train) { - if (is_packed(input)) { - return wrap_tensor_node(torch::nested_tensor::impl::build_structure( - at::dropout(*get_nested_tensor_structure(input).buffer(), p, train), - get_nested_tensor_impl(input)->nested_size())); - } return autograd_map_nested_tensor( [&](const at::Tensor t) { return at::dropout(t, p, train); }, input); } diff --git a/nestedtensor/csrc/matmul.cpp b/nestedtensor/csrc/matmul.cpp index 902bec3c..d52b5b16 100644 --- a/nestedtensor/csrc/matmul.cpp +++ b/nestedtensor/csrc/matmul.cpp @@ -14,70 +14,87 @@ Tensor NestedTensor_matmul(const Tensor& self, const Tensor& other) { if (is_nested_tensor_impl(other)) { auto impl_other = get_nested_tensor_impl(other); auto structure_other = get_nested_tensor_structure(other); -// if (structure_self.buffer() && structure_other.buffer() && -// self.dim() == 4 && other.dim() == 4 && impl_self->opt_sizes()[0] && -// impl_other->opt_sizes()[0] && impl_self->opt_sizes()[1] && -// impl_other->opt_sizes()[1] && impl_self->opt_sizes()[3] && -// impl_other->opt_sizes()[2] && -// (*impl_self->opt_sizes()[0] == *impl_other->opt_sizes()[0]) && -// (*impl_self->opt_sizes()[1] == *impl_other->opt_sizes()[1]) && -// (*impl_self->opt_sizes()[3] == *impl_other->opt_sizes()[2])) { -//#ifdef TRACEPACKED -// std::cout << "calling packed NT x NT matmul" << std::endl; -//#endif -// SizeNode new_nested_size = map( -// [&](c10::List self_size, c10::List other_size) { -// c10::List new_size{ -// self_size[0], self_size[1], other_size[2]}; -// return std::move(new_size); -// }, -// impl_self->nested_size(), -// impl_other->nested_size()); -// auto fn = [](c10::List leaf, int64_t input) { -// return input + leaf[0] * leaf[1] * leaf[2]; -// }; -// int64_t new_numel = reduce>( -// new_nested_size, fn, 0); -// Tensor new_buffer = at::empty({new_numel}, self.options()); -// Tensor result = -// wrap_tensor_node(torch::nested_tensor::impl::build_structure( -// std::move(new_buffer), new_nested_size)); -// apply_nested_tensor( -// [](at::Tensor& result, at::Tensor self, at::Tensor other) { -// at::matmul_out(result, self, other); -// }, -// result, -// self, -// other); -// return result; -// } + if (structure_self.buffer() && structure_other.buffer() && + self.dim() == 4 && other.dim() == 4 && impl_self->opt_sizes()[0] && + impl_other->opt_sizes()[0] && impl_self->opt_sizes()[1] && + impl_other->opt_sizes()[1] && impl_self->opt_sizes()[3] && + impl_other->opt_sizes()[2] && + (*impl_self->opt_sizes()[0] == *impl_other->opt_sizes()[0]) && + (*impl_self->opt_sizes()[1] == *impl_other->opt_sizes()[1]) && + (*impl_self->opt_sizes()[3] == *impl_other->opt_sizes()[2])) { +#ifdef TRACEPACKED + std::cout << "calling packed NT x NT matmul" << std::endl; +#endif + SizeNode new_nested_size = map( + [&](c10::List self_size, c10::List other_size) { + c10::List new_size{ + self_size[0], self_size[1], other_size[2]}; + return std::move(new_size); + }, + impl_self->nested_size(), + impl_other->nested_size()); + auto fn = [](c10::List leaf, int64_t input) { + return input + leaf[0] * leaf[1] * leaf[2]; + }; + int64_t new_numel = reduce>( + new_nested_size, fn, 0); + Tensor new_buffer = at::empty({new_numel}, self.options()); + Tensor result = + wrap_tensor_node(torch::nested_tensor::impl::build_structure( + std::move(new_buffer), new_nested_size)); + 
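// Write each constituent product straight into the packed buffer:
// `result` was built over new_buffer above, so the matmul_out calls
// below fill a single flat allocation for the whole NT x NT batch
// instead of allocating one output per constituent.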
apply_nested_tensor( + [](at::Tensor& result, + at::Tensor self, + at::Tensor other) { at::matmul_out(result, self, other); }, + result, + self, + other); + return result; + } return map_nested_tensor( - [](Tensor s, Tensor o) { return at::matmul(s, o); }, self, other); + [](Tensor s, Tensor o) { return at::matmul(s, o); }, + self, + other); + } + if (structure_self.buffer()) { + if (self.dim() == 3 && other.dim() == 2 && impl_self->opt_sizes()[0] && + impl_self->opt_sizes()[2] && + impl_self->opt_sizes()[self.dim() - 1] == other.size(self.dim() - 2)) { +#ifdef TRACEPACKED + std::cout << "calling packed NT x T matmul" << std::endl; +#endif + SizeNode new_nested_size = map( + [&](c10::List self_size) { + c10::List new_size{self_size[0], other.size(1)}; + return std::move(new_size); + }, + impl_self->nested_size()); + return wrap_tensor_node(torch::nested_tensor::impl::build_structure( + at::matmul( + (*structure_self.buffer()).reshape({-1, other.size(0)}), other) + .reshape(-1), + new_nested_size)); + } } -// if (structure_self.buffer()) { -// if (self.dim() == 3 && other.dim() == 2 && impl_self->opt_sizes()[0] && -// impl_self->opt_sizes()[2] && -// impl_self->opt_sizes()[self.dim() - 1] == other.size(self.dim() - 2)) { -//#ifdef TRACEPACKED -// std::cout << "calling packed NT x T matmul" << std::endl; -//#endif -// SizeNode new_nested_size = map( -// [&](c10::List self_size) { -// c10::List new_size{self_size[0], other.size(1)}; -// return std::move(new_size); -// }, -// impl_self->nested_size()); -// return wrap_tensor_node(torch::nested_tensor::impl::build_structure( -// at::matmul( -// (*structure_self.buffer()).reshape({-1, other.size(0)}), other) -// .reshape(-1), -// new_nested_size)); -// } -// } return map_nested_tensor( [&other](Tensor tensor) { return at::matmul(tensor, other); }, self); } +Tensor& NestedTensor_matmul_out( + Tensor& result, + const Tensor& self, + const Tensor& other) { + AutoGradMode autogradmode(false); + apply_nested_tensor( + [](Tensor& result, Tensor& tensor, Tensor& other) { + return at::matmul_out(result, tensor, other); + }, + result, + self, + other); + return result; +} + Tensor NestedTensor_addmm( const Tensor& input, const Tensor& self, @@ -123,24 +140,9 @@ Tensor NestedTensor_addmm( self); } -Tensor& NestedTensor_matmul_out( - Tensor& result, - const Tensor& self, - const Tensor& other) { - AutoGradMode autogradmode(false); - apply_nested_tensor( - [](Tensor& result, Tensor& tensor, Tensor& other) { - return at::matmul_out(result, tensor, other); - }, - result, - self, - other); - return result; -} - TORCH_LIBRARY_IMPL(aten, PrivateUse1_PreAutograd, m) { - // nt_impl(m, "matmul", no_bw(TORCH_FN(NestedTensor_matmul); nt_impl(m, "addmm", NestedTensor_addmm); + // nt_impl(m, "matmul", no_bw(TORCH_FN(NestedTensor_matmul); nt_impl(m, "matmul", NestedTensor_matmul); nt_impl(m, "matmul.out", NestedTensor_matmul_out); } From 768d2258f61caee8ee45978fbff2489383c68d59 Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Mon, 24 Aug 2020 21:00:50 -0700 Subject: [PATCH 05/12] Checkpoint --- nestedtensor/nn/mha.py | 51 ------------------------------------------ 1 file changed, 51 deletions(-) diff --git a/nestedtensor/nn/mha.py b/nestedtensor/nn/mha.py index a84000f9..868180e8 100644 --- a/nestedtensor/nn/mha.py +++ b/nestedtensor/nn/mha.py @@ -72,10 +72,6 @@ def multi_head_attention_forward(query, # type: Nested head_dim = embed_dim // num_heads assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" scaling = float(head_dim) ** 
-0.5 - # print(query.nested_size()) - # print(key.nested_size()) - # print(value.nested_size()) - # print(in_proj_bias.size()) return torch.ops.nestedtensor.min_mha(num_heads, head_dim, @@ -90,53 +86,6 @@ def multi_head_attention_forward(query, # type: Nested out_proj_weight, out_proj_bias), None - # This is inline in_proj function with in_proj_weight and in_proj_bias - _b = in_proj_bias - _start = 0 - _end = embed_dim - _w = in_proj_weight[_start:_end, :] - print(_w.sum()) - if _b is not None: - _b = _b[_start:_end] - q = F.linear(query, _w, _b) - print(q.sum()) - - # This is inline in_proj function with in_proj_weight and in_proj_bias - _b = in_proj_bias - _start = embed_dim - _end = embed_dim * 2 - _w = in_proj_weight[_start:_end, :] - if _b is not None: - _b = _b[_start:_end] - k = F.linear(key, _w, _b) - - # This is inline in_proj function with in_proj_weight and in_proj_bias - _b = in_proj_bias - _start = embed_dim * 2 - _end = None - _w = in_proj_weight[_start:, :] - if _b is not None: - _b = _b[_start:] - v = F.linear(value, _w, _b) - q = q * scaling - print(q.sum()) - - # NOTE: This is usually contiguous plus a view - q = q.reshape(-1, -1, num_heads, head_dim).transpose(1, 2) - if k is not None: - k = k.reshape(-1, -1, num_heads, head_dim).transpose(1, 2) - if v is not None: - v = v.reshape(-1, -1, num_heads, head_dim).transpose(1, 2) - attn_output_weights = torch.matmul(q, k.transpose(2, 3)) - attn_output_weights = F.softmax( - attn_output_weights, dim=-1) - attn_output_weights = F.dropout( - attn_output_weights, p=dropout_p, training=training) - attn_output = torch.matmul(attn_output_weights, v) - attn_output = attn_output.transpose(1, 2).reshape(-1, -1, embed_dim) - attn_output = F.linear(attn_output, out_proj_weight, out_proj_bias) - return attn_output, None - class MultiheadAttention(Module): __annotations__ = { From 759861199cb721bbd76fd56d787881ac096629f6 Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Mon, 24 Aug 2020 21:01:14 -0700 Subject: [PATCH 06/12] Checkpoint --- nestedtensor/csrc/mha.cpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/nestedtensor/csrc/mha.cpp b/nestedtensor/csrc/mha.cpp index 3083fa11..62303ff8 100644 --- a/nestedtensor/csrc/mha.cpp +++ b/nestedtensor/csrc/mha.cpp @@ -34,10 +34,6 @@ at::Tensor min_mha( int64_t edim = query.size(2); at::Tensor q, k, v; - // TODO: Use addmm! 
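// (This is what the addmm calls below implement: at::addmm(b, x, W.t(),
// beta, alpha) returns beta * b + alpha * (x @ W.t()), so passing
// `scaling` for both beta and alpha on the q projection folds the
// 1/sqrt(head_dim) scaling into the same GEMM as the bias add.)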
- // if input.dim() == 2 and bias is not None: - // # fused op is marginally faster - // ret = torch.addmm(bias, input, weight.t()) if (in_proj_bias) { q = at::addmm( at::slice(*in_proj_bias, 0, 0, edim), @@ -53,9 +49,6 @@ at::Tensor min_mha( at::slice(*in_proj_bias, 0, 2 * edim), value, at::slice(in_proj_weight, 0, 2 * edim).t()); - // q = q + at::slice(*in_proj_bias, 0, 0, edim); - // k = k + at::slice(*in_proj_bias, 0, edim, 2 * edim); - // v = v + at::slice(*in_proj_bias, 0, 2 * edim); } else { q = at::matmul(query, at::slice(in_proj_weight, 0, 0, edim).t()); k = at::matmul(key, at::slice(in_proj_weight, 0, edim, 2 * edim).t()); From e8c591a13f9634f64d41241cd7770c5ce1fd9523 Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Mon, 24 Aug 2020 21:02:25 -0700 Subject: [PATCH 07/12] Checkpoint --- nestedtensor/csrc/py_init.cpp | 2 +- nestedtensor/nn/mha.py | 3 --- nestedtensor/version.py | 4 ++-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/nestedtensor/csrc/py_init.cpp b/nestedtensor/csrc/py_init.cpp index c696738b..20cc0ef1 100644 --- a/nestedtensor/csrc/py_init.cpp +++ b/nestedtensor/csrc/py_init.cpp @@ -7,7 +7,6 @@ #include #include #include -#include // NOTE: A NestedTensor without any constituents, i.e. // nested_tensor([]) is of dimension 1 because @@ -275,4 +274,5 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { }); add_functions(m); + add_mha(m); } diff --git a/nestedtensor/nn/mha.py b/nestedtensor/nn/mha.py index 868180e8..963a3444 100644 --- a/nestedtensor/nn/mha.py +++ b/nestedtensor/nn/mha.py @@ -47,9 +47,6 @@ def multi_head_attention_forward(query, # type: Nested assert isinstance(query, nestedtensor.NestedTensor) assert isinstance(key, nestedtensor.NestedTensor) assert isinstance(value, nestedtensor.NestedTensor) - query = query.contiguous() - key = key.contiguous() - value = value.contiguous() assert torch.is_tensor(out_proj_weight) assert torch.is_tensor(out_proj_bias) diff --git a/nestedtensor/version.py b/nestedtensor/version.py index 5dbadc67..2a70879c 100644 --- a/nestedtensor/version.py +++ b/nestedtensor/version.py @@ -1,5 +1,5 @@ -__version__ = '0.0.1.dev20208253+eec9dfd' -git_version = 'eec9dfdf38998aa76b6182a60bdef4df6eb7796f' +__version__ = '0.0.1.dev20208254+7598611' +git_version = '759861199cb721bbd76fd56d787881ac096629f6' from nestedtensor import _C if hasattr(_C, 'CUDA_VERSION'): cuda = _C.CUDA_VERSION From bfefe757d4cbfd7acc7303497980ea9816c8e5f4 Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Mon, 24 Aug 2020 21:03:30 -0700 Subject: [PATCH 08/12] Checkpoint --- nestedtensor/csrc/py_init.cpp | 1 - nestedtensor/version.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/nestedtensor/csrc/py_init.cpp b/nestedtensor/csrc/py_init.cpp index 20cc0ef1..d2a1579d 100644 --- a/nestedtensor/csrc/py_init.cpp +++ b/nestedtensor/csrc/py_init.cpp @@ -274,5 +274,4 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { }); add_functions(m); - add_mha(m); } diff --git a/nestedtensor/version.py b/nestedtensor/version.py index 2a70879c..1e96683d 100644 --- a/nestedtensor/version.py +++ b/nestedtensor/version.py @@ -1,5 +1,5 @@ -__version__ = '0.0.1.dev20208254+7598611' -git_version = '759861199cb721bbd76fd56d787881ac096629f6' +__version__ = '0.0.1.dev20208254+e8c591a' +git_version = 'e8c591a13f9634f64d41241cd7770c5ce1fd9523' from nestedtensor import _C if hasattr(_C, 'CUDA_VERSION'): cuda = _C.CUDA_VERSION From 8c381e21692b923eb2ba58b84c1fe5955ae207ad Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Mon, 24 Aug 2020 21:04:22 -0700 
Subject: [PATCH 09/12] Checkpoint --- nestedtensor/csrc/mha.h | 0 nestedtensor/version.py | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) delete mode 100644 nestedtensor/csrc/mha.h diff --git a/nestedtensor/csrc/mha.h b/nestedtensor/csrc/mha.h deleted file mode 100644 index e69de29b..00000000 diff --git a/nestedtensor/version.py b/nestedtensor/version.py index 1e96683d..15a59192 100644 --- a/nestedtensor/version.py +++ b/nestedtensor/version.py @@ -1,5 +1,5 @@ -__version__ = '0.0.1.dev20208254+e8c591a' -git_version = 'e8c591a13f9634f64d41241cd7770c5ce1fd9523' +__version__ = '0.0.1.dev20208254+bfefe75' +git_version = 'bfefe757d4cbfd7acc7303497980ea9816c8e5f4' from nestedtensor import _C if hasattr(_C, 'CUDA_VERSION'): cuda = _C.CUDA_VERSION From c4d8dc2f5786e5ddb96a7458a319c253d0261e73 Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Mon, 24 Aug 2020 21:07:33 -0700 Subject: [PATCH 10/12] Checkpoint --- nestedtensor/csrc/mha.cpp | 3 +-- nestedtensor/version.py | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/nestedtensor/csrc/mha.cpp b/nestedtensor/csrc/mha.cpp index 62303ff8..c3357774 100644 --- a/nestedtensor/csrc/mha.cpp +++ b/nestedtensor/csrc/mha.cpp @@ -64,8 +64,7 @@ at::Tensor min_mha( attn_output_weights = at::dropout(attn_output_weights, dropout_p, training); auto attn_output = at::matmul(attn_output_weights, v); attn_output = attn_output.transpose(1, 2).reshape({-1, -1, edim}); - attn_output = at::matmul(attn_output, out_proj_weight.t()); - attn_output += out_proj_bias; + attn_output = at::addmm(out_proj_bias, attn_output, out_proj_weight.t()); return attn_output; } diff --git a/nestedtensor/version.py b/nestedtensor/version.py index 15a59192..49ba8adc 100644 --- a/nestedtensor/version.py +++ b/nestedtensor/version.py @@ -1,5 +1,5 @@ -__version__ = '0.0.1.dev20208254+bfefe75' -git_version = 'bfefe757d4cbfd7acc7303497980ea9816c8e5f4' +__version__ = '0.0.1.dev20208254+8c381e2' +git_version = '8c381e21692b923eb2ba58b84c1fe5955ae207ad' from nestedtensor import _C if hasattr(_C, 'CUDA_VERSION'): cuda = _C.CUDA_VERSION From 2e7142023fb185bcfd166784632462e3ab0e87f6 Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Mon, 24 Aug 2020 21:50:50 -0700 Subject: [PATCH 11/12] Checkpoint --- benchmarks/frozenbatchnorm2d.py | 73 ++++++++++++++++++++++++ benchmarks/mha.py | 9 ++- nestedtensor/csrc/autograd_functions.cpp | 2 +- nestedtensor/version.py | 4 +- 4 files changed, 80 insertions(+), 8 deletions(-) create mode 100644 benchmarks/frozenbatchnorm2d.py diff --git a/benchmarks/frozenbatchnorm2d.py b/benchmarks/frozenbatchnorm2d.py new file mode 100644 index 00000000..4172ebb8 --- /dev/null +++ b/benchmarks/frozenbatchnorm2d.py @@ -0,0 +1,73 @@ +import torch +import nestedtensor +import utils +import torchvision + +import random + +random.seed(1010) +RAND_INTS = [random.randint(10, 30) for _ in range(2000)] +RAND_INTS = [random.randint(100, 300) for _ in range(20)] + +class FrozenBatchNorm2d(torch.nn.Module): + """ + BatchNorm2d where the batch statistics and the affine parameters are fixed. + + Copy-paste from torchvision.misc.ops with added eps before rqsrt, + without which any other models than torchvision.models.resnet[18,34,50,101] + produce nans. 
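+    The affine transform is folded into two constants up front,
+    scale = weight * (running_var + eps).rsqrt() and
+    bias = bias - running_mean * scale, so forward() reduces to a
+    single fused multiply-add, y = x * scale + bias.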
+ """ + + def __init__(self, n): + super(FrozenBatchNorm2d, self).__init__() + self.register_buffer("weight", torch.ones(n)) + self.register_buffer("bias", torch.zeros(n)) + self.register_buffer("running_mean", torch.zeros(n)) + self.register_buffer("running_var", torch.ones(n)) + + def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs): + num_batches_tracked_key = prefix + 'num_batches_tracked' + if num_batches_tracked_key in state_dict: + del state_dict[num_batches_tracked_key] + + super(FrozenBatchNorm2d, self)._load_from_state_dict( + state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs) + + def forward(self, x): + print("DHDHDH") + # move reshapes to the beginning + # to make it fuser-friendly + w = self.weight.reshape(1, -1, 1, 1) + b = self.bias.reshape(1, -1, 1, 1) + rv = self.running_var.reshape(1, -1, 1, 1) + rm = self.running_mean.reshape(1, -1, 1, 1) + eps = 1e-5 + scale = w * (rv + eps).rsqrt() + bias = b - rm * scale + return (x * scale + bias).squeeze(1) + +MODEL = FrozenBatchNorm2d(64) + +def gen_t_loop_frozenbatchnorm2d(): + tensors = [torch.rand(64, i, 256).cuda() for i in RAND_INTS] + + def t_loop(): + for t in tensors: + MODEL(t.unsqueeze(0)) + return t_loop + + +def gen_nt_frozenbatchnorm2d(): + nt0 = nestedtensor.nested_tensor( + [torch.rand(64, i, 256).cuda() for i in RAND_INTS]) + + def nt(): + MODEL(nt0) + return nt + + +if __name__ == "__main__": + print(utils.benchmark_fn(gen_nt_frozenbatchnorm2d())) + print(utils.benchmark_fn(gen_t_loop_frozenbatchnorm2d())) diff --git a/benchmarks/mha.py b/benchmarks/mha.py index d07dc516..8fbcbaf4 100644 --- a/benchmarks/mha.py +++ b/benchmarks/mha.py @@ -5,7 +5,6 @@ import random -# Performance tanks hard for lots of small Tensors as expected random.seed(1010) RAND_INTS = [random.randint(10, 30) for _ in range(2000)] RAND_INTS = [random.randint(100, 300) for _ in range(20)] @@ -14,7 +13,7 @@ MODEL0 = torch.nn.MultiheadAttention(256, 8, dropout=0.1).cuda() MODEL1 = nestedtensor.nn.MultiheadAttention(256, 8, dropout=0.1).cuda() -def gen_t_loop_segmentation(): +def gen_t_loop_mha(): tensors = [torch.rand(1, i, 256).cuda() for i in RAND_INTS] def t_loop(): @@ -23,7 +22,7 @@ def t_loop(): return t_loop -def gen_nt_segmentation(): +def gen_nt_mha(): nt0 = nestedtensor.nested_tensor( [torch.rand(i, 256).cuda() for i in RAND_INTS]) @@ -33,5 +32,5 @@ def nt(): if __name__ == "__main__": - print(utils.benchmark_fn(gen_nt_segmentation())) - print(utils.benchmark_fn(gen_t_loop_segmentation())) + print(utils.benchmark_fn(gen_nt_mha())) + print(utils.benchmark_fn(gen_t_loop_mha())) diff --git a/nestedtensor/csrc/autograd_functions.cpp b/nestedtensor/csrc/autograd_functions.cpp index 5018dd82..7312acc1 100644 --- a/nestedtensor/csrc/autograd_functions.cpp +++ b/nestedtensor/csrc/autograd_functions.cpp @@ -58,7 +58,7 @@ struct NestedTensorFunction_batch_norm cudnn_enabled) .squeeze(0); }, - autograd_input); + autograd_input).contiguous(); ctx->saved_data["0"] = weight; ctx->saved_data["1"] = bias; ctx->saved_data["2"] = autograd_output; diff --git a/nestedtensor/version.py b/nestedtensor/version.py index 49ba8adc..46b83c65 100644 --- a/nestedtensor/version.py +++ b/nestedtensor/version.py @@ -1,5 +1,5 @@ -__version__ = '0.0.1.dev20208254+8c381e2' -git_version = '8c381e21692b923eb2ba58b84c1fe5955ae207ad' +__version__ = '0.0.1.dev20208254+c4d8dc2' +git_version = 'c4d8dc2f5786e5ddb96a7458a319c253d0261e73' from nestedtensor import _C if 
hasattr(_C, 'CUDA_VERSION'): cuda = _C.CUDA_VERSION From f5e6a469c1b28d811a5ecec269915876f03039a2 Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Tue, 25 Aug 2020 07:10:18 -0700 Subject: [PATCH 12/12] Checkpoint --- benchmarks/frozenbatchnorm2d.py | 8 ++++++-- benchmarks/mha.py | 5 ----- benchmarks/utils.py | 1 + nestedtensor/csrc/BinaryOps.cpp | 16 ++++++++++------ nestedtensor/csrc/functions.cpp | 2 +- nestedtensor/nn/mha.py | 3 +++ nestedtensor/version.py | 4 ++-- 7 files changed, 23 insertions(+), 16 deletions(-) diff --git a/benchmarks/frozenbatchnorm2d.py b/benchmarks/frozenbatchnorm2d.py index 4172ebb8..4945de2f 100644 --- a/benchmarks/frozenbatchnorm2d.py +++ b/benchmarks/frozenbatchnorm2d.py @@ -36,7 +36,6 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs) def forward(self, x): - print("DHDHDH") # move reshapes to the beginning # to make it fuser-friendly w = self.weight.reshape(1, -1, 1, 1) @@ -46,9 +45,14 @@ def forward(self, x): eps = 1e-5 scale = w * (rv + eps).rsqrt() bias = b - rm * scale + # print(scale.size()) + # print(bias.size()) + # print(type(scale)) + # print(type(bias)) + # print(x.nested_size()) return (x * scale + bias).squeeze(1) -MODEL = FrozenBatchNorm2d(64) +MODEL = FrozenBatchNorm2d(64).cuda() def gen_t_loop_frozenbatchnorm2d(): tensors = [torch.rand(64, i, 256).cuda() for i in RAND_INTS] diff --git a/benchmarks/mha.py b/benchmarks/mha.py index 776a9aaf..8fbcbaf4 100644 --- a/benchmarks/mha.py +++ b/benchmarks/mha.py @@ -32,10 +32,5 @@ def nt(): if __name__ == "__main__": -<<<<<<< HEAD print(utils.benchmark_fn(gen_nt_mha())) print(utils.benchmark_fn(gen_t_loop_mha())) -======= - print(utils.benchmark_fn(gen_nt_segmentation())) - print(utils.benchmark_fn(gen_t_loop_segmentation())) ->>>>>>> master diff --git a/benchmarks/utils.py b/benchmarks/utils.py index d1268602..61e0fc5f 100644 --- a/benchmarks/utils.py +++ b/benchmarks/utils.py @@ -28,6 +28,7 @@ def benchmark_fn(fn, run_time = 5.0, use_cprofile=False, warmup=1.0, cuda=False) if use_cprofile: pr.enable() fn() + # import sys; sys.exit(1) if cuda: torch.cuda.synchronize() if use_cprofile: diff --git a/nestedtensor/csrc/BinaryOps.cpp b/nestedtensor/csrc/BinaryOps.cpp index c91a0587..b020dfe3 100644 --- a/nestedtensor/csrc/BinaryOps.cpp +++ b/nestedtensor/csrc/BinaryOps.cpp @@ -43,14 +43,18 @@ Tensor NestedTensor_binary(const Tensor& self, const Tensor& other) { return map_nested_tensor( [&self](Tensor other) { return func(self, other); }, other); } - if (is_packed(self) && (other.dim() == 0 || (other.dim() == 1 && other.numel() == 1))) { + if (is_packed(self)) { + auto self_structure = get_nested_tensor_structure(self); + auto self_impl = get_nested_tensor_impl(self); + if (other.dim() == 0 || (other.dim() == 1 && other.numel() == 1)) { #ifdef TRACEPACKED - std::cout << "calling packed binary " << typeid(func).name() << std::endl; + std::cout << "calling packed binary NT x T 0-dim / 1-dim 1-numel" + << typeid(func).name() << std::endl; #endif - auto self_structure = get_nested_tensor_structure(self); - return wrap_tensor_node(torch::nested_tensor::impl::build_structure( - func((*self_structure.buffer()), other), - get_nested_tensor_impl(self)->nested_size())); + return wrap_tensor_node(torch::nested_tensor::impl::build_structure( + func((*self_structure.buffer()), other), + get_nested_tensor_impl(self)->nested_size())); + } } return map_nested_tensor( [&other](Tensor self) { return func(self, other); }, self); diff --git 
a/nestedtensor/csrc/functions.cpp b/nestedtensor/csrc/functions.cpp index aa6b5ddf..d15dd2c6 100644 --- a/nestedtensor/csrc/functions.cpp +++ b/nestedtensor/csrc/functions.cpp @@ -139,7 +139,7 @@ Tensor NestedTensor_layer_norm( [normalized_shape, &weight, &bias, eps](const at::Tensor t) { return at::layer_norm(t, normalized_shape, weight, bias, eps, true); }, - input); + input).contiguous(); } Tensor NestedTensor_all(const Tensor& self) { diff --git a/nestedtensor/nn/mha.py b/nestedtensor/nn/mha.py index 963a3444..84e6f7af 100644 --- a/nestedtensor/nn/mha.py +++ b/nestedtensor/nn/mha.py @@ -49,6 +49,9 @@ def multi_head_attention_forward(query, # type: Nested assert isinstance(value, nestedtensor.NestedTensor) assert torch.is_tensor(out_proj_weight) assert torch.is_tensor(out_proj_bias) + query = query.contiguous() + key = key.contiguous() + value = value.contiguous() # TODO: Explicitly unsupported flags assert not use_separate_proj_weight diff --git a/nestedtensor/version.py b/nestedtensor/version.py index 46b83c65..73166668 100644 --- a/nestedtensor/version.py +++ b/nestedtensor/version.py @@ -1,5 +1,5 @@ -__version__ = '0.0.1.dev20208254+c4d8dc2' -git_version = 'c4d8dc2f5786e5ddb96a7458a319c253d0261e73' +__version__ = '0.0.1.dev20208255+faee8a1' +git_version = 'faee8a1a2578f7ecb80098d2cb792ea7c22e61ab' from nestedtensor import _C if hasattr(_C, 'CUDA_VERSION'): cuda = _C.CUDA_VERSION
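Reference note: the single-tensor math that nestedtensor::min_mha implements
can be sketched in plain PyTorch. This is an illustrative sketch only, not
part of the series: the name ref_min_mha is made up, shapes are dense
(batch, seq, embed_dim), in_proj_bias is assumed present, and masking /
need_weights are omitted, mirroring the C++ path above.

    import torch
    import torch.nn.functional as F

    def ref_min_mha(num_heads, head_dim, dropout_p, training,
                    query, key, value, in_proj_weight, in_proj_bias,
                    scaling, out_proj_weight, out_proj_bias):
        # query/key/value: (batch, seq, E) with E == num_heads * head_dim
        bsz, seq, edim = query.shape
        # rows [0, E), [E, 2E), [2E, 3E) of the packed projection weight
        w_q, w_k, w_v = in_proj_weight.chunk(3, dim=0)
        b_q, b_k, b_v = in_proj_bias.chunk(3, dim=0)
        # addmm(b, x, W.t(), beta=scaling, alpha=scaling) == scaling * (x @ W.t() + b)
        q = scaling * (query @ w_q.t() + b_q)
        k = key @ w_k.t() + b_k
        v = value @ w_v.t() + b_v
        # split heads: (batch, heads, seq, head_dim)
        q = q.reshape(bsz, seq, num_heads, head_dim).transpose(1, 2)
        k = k.reshape(bsz, seq, num_heads, head_dim).transpose(1, 2)
        v = v.reshape(bsz, seq, num_heads, head_dim).transpose(1, 2)
        attn = F.softmax(q @ k.transpose(2, 3), dim=-1)
        attn = F.dropout(attn, p=dropout_p, training=training)
        out = (attn @ v).transpose(1, 2).reshape(bsz, seq, edim)
        return out @ out_proj_weight.t() + out_proj_bias

    # The packed fast paths in matmul.cpp follow the same one-big-op idea:
    # view the flat NT buffer as a 2-D matrix, run one large GEMM, reshape
    # back over nested_size. (buffer, other, input as in NestedTensor_addmm;
    # sketch only.)
    #   flat = buffer.reshape(-1, other.size(0))   # constituents stacked row-wise
    #   out = torch.addmm(input, flat, other)      # one GEMM over packed rows
    #   packed = out.reshape(-1)                   # back to the flat NT buffer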