From 7804c5c437b1e5f4736f4b01d624873d04940983 Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Mon, 24 Aug 2020 18:23:57 -0700 Subject: [PATCH 01/12] Checkpoint --- benchmarks/utils.py | 3 + nestedtensor/csrc/BinaryOps.cpp | 2 +- nestedtensor/csrc/autograd_functions.cpp | 18 ++++++ nestedtensor/csrc/matmul.cpp | 80 ++++++++++++------------ nestedtensor/csrc/mha.cpp | 64 +++++++++++++++++++ nestedtensor/csrc/mha.h | 0 nestedtensor/csrc/py_init.cpp | 1 + nestedtensor/csrc/utils/nested_node.h | 1 + nestedtensor/nn/mha.py | 18 +++++- nestedtensor/version.py | 4 +- 10 files changed, 147 insertions(+), 44 deletions(-) create mode 100644 nestedtensor/csrc/mha.cpp create mode 100644 nestedtensor/csrc/mha.h diff --git a/benchmarks/utils.py b/benchmarks/utils.py index d1268602..276f1ce3 100644 --- a/benchmarks/utils.py +++ b/benchmarks/utils.py @@ -28,6 +28,9 @@ def benchmark_fn(fn, run_time = 5.0, use_cprofile=False, warmup=1.0, cuda=False) if use_cprofile: pr.enable() fn() + # if t > 1: + # import sys; sys.exit(1) + # import sys; sys.exit(1) if cuda: torch.cuda.synchronize() if use_cprofile: diff --git a/nestedtensor/csrc/BinaryOps.cpp b/nestedtensor/csrc/BinaryOps.cpp index 4b55fa47..c91a0587 100644 --- a/nestedtensor/csrc/BinaryOps.cpp +++ b/nestedtensor/csrc/BinaryOps.cpp @@ -43,7 +43,7 @@ Tensor NestedTensor_binary(const Tensor& self, const Tensor& other) { return map_nested_tensor( [&self](Tensor other) { return func(self, other); }, other); } - if (is_packed(self) && other.dim() == 0) { + if (is_packed(self) && (other.dim() == 0 || (other.dim() == 1 && other.numel() == 1))) { #ifdef TRACEPACKED std::cout << "calling packed binary " << typeid(func).name() << std::endl; #endif diff --git a/nestedtensor/csrc/autograd_functions.cpp b/nestedtensor/csrc/autograd_functions.cpp index 5b83663b..fdf5d8f5 100644 --- a/nestedtensor/csrc/autograd_functions.cpp +++ b/nestedtensor/csrc/autograd_functions.cpp @@ -206,6 +206,11 @@ Tensor NestedTensor_threshold_backward( } Tensor NestedTensor_dropout(const Tensor& input, double p, bool train) { + if (is_packed(input)) { + return wrap_tensor_node(torch::nested_tensor::impl::build_structure( + at::dropout(*get_nested_tensor_structure(input).buffer(), p, train), + get_nested_tensor_impl(input)->nested_size())); + } return autograd_map_nested_tensor( [&](const at::Tensor t) { return at::dropout(t, p, train); }, input); } @@ -284,6 +289,19 @@ Tensor NestedTensor_add(const Tensor& self, const Tensor& other, Scalar alpha) { return map_nested_tensor( [&](at::Tensor o) { return at::add(self, o, alpha); }, other); } + if (is_packed(self) && self.dim() == 3 && other.dim() == 1) { +#ifdef TRACEPACKED + std::cout << "calling packed add" << std::endl; +#endif + auto self_structure = get_nested_tensor_structure(self); + auto self_impl = get_nested_tensor_impl(self); + return wrap_tensor_node(torch::nested_tensor::impl::build_structure( + (*self_structure.buffer()) + .reshape({-1, other.size(0)}) + .add(other) + .reshape({-1}), + self_impl->nested_size())); + } return map_nested_tensor( [&](at::Tensor s) { return at::add(s, other, alpha); }, self); } diff --git a/nestedtensor/csrc/matmul.cpp b/nestedtensor/csrc/matmul.cpp index 72ef3b5c..8a23a215 100644 --- a/nestedtensor/csrc/matmul.cpp +++ b/nestedtensor/csrc/matmul.cpp @@ -11,46 +11,46 @@ Tensor NestedTensor_matmul(const Tensor& self, const Tensor& other) { AutoGradMode autogradmode(false); auto impl_self = get_nested_tensor_impl(self); auto structure_self = get_nested_tensor_structure(self); - if 
(is_nested_tensor_impl(other)) { - auto impl_other = get_nested_tensor_impl(other); - auto structure_other = get_nested_tensor_structure(other); - if (structure_self.buffer() && structure_other.buffer() && - self.dim() == 4 && other.dim() == 4 && impl_self->opt_sizes()[0] && - impl_other->opt_sizes()[0] && impl_self->opt_sizes()[1] && - impl_other->opt_sizes()[1] && impl_self->opt_sizes()[3] && - impl_other->opt_sizes()[2] && - (*impl_self->opt_sizes()[0] == *impl_other->opt_sizes()[0]) && - (*impl_self->opt_sizes()[1] == *impl_other->opt_sizes()[1]) && - (*impl_self->opt_sizes()[3] == *impl_other->opt_sizes()[2])) { -#ifdef TRACEPACKED - std::cout << "calling packed NT x NT matmul" << std::endl; -#endif - SizeNode new_nested_size = map( - [&](c10::List self_size, c10::List other_size) { - c10::List new_size{ - self_size[0], self_size[1], other_size[2]}; - return std::move(new_size); - }, - impl_self->nested_size(), - impl_other->nested_size()); - auto fn = [](c10::List leaf, int64_t input) { - return input + leaf[0] * leaf[1] * leaf[2]; - }; - int64_t new_numel = reduce>( - new_nested_size, fn, 0); - Tensor new_buffer = at::empty({new_numel}, self.options()); - Tensor result = - wrap_tensor_node(torch::nested_tensor::impl::build_structure( - std::move(new_buffer), new_nested_size)); - apply_nested_tensor( - [](at::Tensor& result, - at::Tensor self, - at::Tensor other) { at::matmul_out(result, self, other); }, - result, - self, - other); - return result; - } + if (is_nested_tensor_impl(other)) { +// auto impl_other = get_nested_tensor_impl(other); +// auto structure_other = get_nested_tensor_structure(other); +// if (structure_self.buffer() && structure_other.buffer() && +// self.dim() == 4 && other.dim() == 4 && impl_self->opt_sizes()[0] && +// impl_other->opt_sizes()[0] && impl_self->opt_sizes()[1] && +// impl_other->opt_sizes()[1] && impl_self->opt_sizes()[3] && +// impl_other->opt_sizes()[2] && +// (*impl_self->opt_sizes()[0] == *impl_other->opt_sizes()[0]) && +// (*impl_self->opt_sizes()[1] == *impl_other->opt_sizes()[1]) && +// (*impl_self->opt_sizes()[3] == *impl_other->opt_sizes()[2])) { +// #ifdef TRACEPACKED +// std::cout << "calling packed NT x NT matmul" << std::endl; +// #endif +// SizeNode new_nested_size = map( +// [&](c10::List self_size, c10::List other_size) { +// c10::List new_size{ +// self_size[0], self_size[1], other_size[2]}; +// return std::move(new_size); +// }, +// impl_self->nested_size(), +// impl_other->nested_size()); +// auto fn = [](c10::List leaf, int64_t input) { +// return input + leaf[0] * leaf[1] * leaf[2]; +// }; +// int64_t new_numel = reduce>( +// new_nested_size, fn, 0); +// // Tensor new_buffer = at::empty({new_numel}, self.options()); +// // Tensor result = +// // wrap_tensor_node(torch::nested_tensor::impl::build_structure( +// // std::move(new_buffer), new_nested_size)); +// return map_nested_tensor( +// [](//at::Tensor& result, +// at::Tensor self, +// at::Tensor other) { at::matmul(self, other); }, +// // result, +// self, +// other); +// // return result; +// } return map_nested_tensor( [](Tensor s, Tensor o) { return at::matmul(s, o); }, self, diff --git a/nestedtensor/csrc/mha.cpp b/nestedtensor/csrc/mha.cpp new file mode 100644 index 00000000..8441e49f --- /dev/null +++ b/nestedtensor/csrc/mha.cpp @@ -0,0 +1,64 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +namespace py = pybind11; + +using namespace torch::nested_tensor; +using namespace at; + +namespace torch { +namespace nested_tensor { + 
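+// min_mha is a minimal fused multi-head attention forward. Per head it
+// computes softmax((q * scaling) k^T) v, where q, k and v come from the
+// row blocks [0, E), [E, 2E) and [2E, 3E) of in_proj_weight (and of
+// in_proj_bias, if present) applied to query, key and value, with
+// E = embed_dim = num_heads * head_dim, followed by the
+// out_proj_weight / out_proj_bias output projection. Attention masks
+// and need_weights are not handled here.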
+at::Tensor min_mha( + int64_t num_heads, + int64_t head_dim, + double dropout_p, + bool training, + at::Tensor query, + at::Tensor key, + at::Tensor value, + at::Tensor in_proj_weight, + c10::optional in_proj_bias, + double scaling, + at::Tensor out_proj_weight, + at::Tensor out_proj_bias) { + TORCH_CHECK(query.dim() == 3, "query needs to be 3 dim."); + TORCH_CHECK(key.dim() == 3, "key needs to be 3 dim."); + TORCH_CHECK(value.dim() == 3, "value needs to be 3 dim."); + int64_t edim = query.size(2); + + auto q = at::matmul(query, at::slice(in_proj_weight, 0, 0, edim)); + auto k = at::matmul(key, at::slice(in_proj_weight, 0, edim, 2 * edim)); + auto v = at::matmul(value, at::slice(in_proj_weight, 0, 2 * edim)); + if (in_proj_bias) { + q = q + at::slice(*in_proj_bias, 0, 0, edim); + k = k + at::slice(*in_proj_bias, 0, edim, 2 * edim); + v = v + at::slice(*in_proj_bias, 0, 2 * edim); + } + + q = at::mul(q, torch::tensor({scaling}, q.options())); + + q = q.reshape({-1, -1, num_heads, head_dim}).transpose(1, 2); + k = k.reshape({-1, -1, num_heads, head_dim}).transpose(1, 2); + v = v.reshape({-1, -1, num_heads, head_dim}).transpose(1, 2); + auto attn_output_weights = at::matmul(q, k.transpose(2, 3)); + attn_output_weights = at::softmax(attn_output_weights, -1).contiguous(); + attn_output_weights = at::dropout(attn_output_weights, dropout_p, training); + auto attn_output = at::matmul(attn_output_weights, v); + attn_output = attn_output.transpose(1, 2).reshape({-1, -1, edim}).contiguous(); + attn_output = at::matmul(attn_output, out_proj_weight); + attn_output += out_proj_bias; + return attn_output; +} + +static auto registry = + torch::RegisterOperators().op("nestedtensor::min_mha", &min_mha); + +} // namespace nested_tensor +} // namespace torch diff --git a/nestedtensor/csrc/mha.h b/nestedtensor/csrc/mha.h new file mode 100644 index 00000000..e69de29b diff --git a/nestedtensor/csrc/py_init.cpp b/nestedtensor/csrc/py_init.cpp index d2a1579d..c696738b 100644 --- a/nestedtensor/csrc/py_init.cpp +++ b/nestedtensor/csrc/py_init.cpp @@ -7,6 +7,7 @@ #include #include #include +#include // NOTE: A NestedTensor without any constituents, i.e. // nested_tensor([]) is of dimension 1 because diff --git a/nestedtensor/csrc/utils/nested_node.h b/nestedtensor/csrc/utils/nested_node.h index 7a791043..6f2b9f4b 100644 --- a/nestedtensor/csrc/utils/nested_node.h +++ b/nestedtensor/csrc/utils/nested_node.h @@ -402,6 +402,7 @@ inline TensorNode build_structure( inline TensorNode build_structure( at::Tensor&& buffer, const SizeNode& nested_size) { + TORCH_CHECK(buffer.dim() == 1, "Given buffer must be vector, i.e. 
dim 1 Tensor."); SizeNode nested_stride = map( [](c10::List size) { return _cont_stride(size); }, nested_size); return build_structure(std::move(buffer), nested_size, nested_stride); diff --git a/nestedtensor/nn/mha.py b/nestedtensor/nn/mha.py index d076a28b..ebe89f1b 100644 --- a/nestedtensor/nn/mha.py +++ b/nestedtensor/nn/mha.py @@ -69,7 +69,23 @@ def multi_head_attention_forward(query, # type: Nested head_dim = embed_dim // num_heads assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" scaling = float(head_dim) ** -0.5 - + # print(query.nested_size()) + # print(key.nested_size()) + # print(value.nested_size()) + # print(in_proj_bias.size()) + + return torch.ops.nestedtensor.min_mha(num_heads, + head_dim, + dropout_p, + training, + query._impl, + key._impl, + value._impl, + in_proj_weight, + in_proj_bias, + scaling, + out_proj_weight, + out_proj_bias), None # This is inline in_proj function with in_proj_weight and in_proj_bias _b = in_proj_bias diff --git a/nestedtensor/version.py b/nestedtensor/version.py index 5c8ced47..b17166b5 100644 --- a/nestedtensor/version.py +++ b/nestedtensor/version.py @@ -1,5 +1,5 @@ -__version__ = '0.0.1.dev202082421+93f26bc' -git_version = '93f26bcc6cd53fbf7644909e148efcdf2f6f49a3' +__version__ = '0.0.1.dev20208251+17bf81d' +git_version = '17bf81da887e618d3931b229795b6320b26b3f78' from nestedtensor import _C if hasattr(_C, 'CUDA_VERSION'): cuda = _C.CUDA_VERSION From eec9dfdf38998aa76b6182a60bdef4df6eb7796f Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Mon, 24 Aug 2020 19:28:59 -0700 Subject: [PATCH 02/12] Checkpoint --- benchmarks/mha.py | 3 +- benchmarks/utils.py | 2 +- nestedtensor/csrc/matmul.cpp | 109 +++++++++++++++++++++-------------- nestedtensor/csrc/mha.cpp | 1 + nestedtensor/nn/mha.py | 8 +-- nestedtensor/version.py | 4 +- 6 files changed, 76 insertions(+), 51 deletions(-) diff --git a/benchmarks/mha.py b/benchmarks/mha.py index d0f3ba58..9d117260 100644 --- a/benchmarks/mha.py +++ b/benchmarks/mha.py @@ -6,8 +6,9 @@ import random # Performance tanks hard for lots of small Tensors as expected +random.seed(1010) RAND_INTS = [random.randint(10, 30) for _ in range(2000)] -RAND_INTS = [random.randint(100, 300) for _ in range(20)] +RAND_INTS = [random.randint(100, 300) for _ in range(2)] # (26, None, 256) (26, None, 256) (26, None, 256) torch.Size([256, 256]) torch.Size([256]) MODEL0 = torch.nn.MultiheadAttention(256, 8, dropout=0.1).cuda() diff --git a/benchmarks/utils.py b/benchmarks/utils.py index 276f1ce3..0d1991fd 100644 --- a/benchmarks/utils.py +++ b/benchmarks/utils.py @@ -30,7 +30,7 @@ def benchmark_fn(fn, run_time = 5.0, use_cprofile=False, warmup=1.0, cuda=False) fn() # if t > 1: # import sys; sys.exit(1) - # import sys; sys.exit(1) + import sys; sys.exit(1) if cuda: torch.cuda.synchronize() if use_cprofile: diff --git a/nestedtensor/csrc/matmul.cpp b/nestedtensor/csrc/matmul.cpp index 8a23a215..33c0d132 100644 --- a/nestedtensor/csrc/matmul.cpp +++ b/nestedtensor/csrc/matmul.cpp @@ -11,50 +11,73 @@ Tensor NestedTensor_matmul(const Tensor& self, const Tensor& other) { AutoGradMode autogradmode(false); auto impl_self = get_nested_tensor_impl(self); auto structure_self = get_nested_tensor_structure(self); - if (is_nested_tensor_impl(other)) { -// auto impl_other = get_nested_tensor_impl(other); -// auto structure_other = get_nested_tensor_structure(other); -// if (structure_self.buffer() && structure_other.buffer() && -// self.dim() == 4 && other.dim() == 4 && impl_self->opt_sizes()[0] && -// 
impl_other->opt_sizes()[0] && impl_self->opt_sizes()[1] && -// impl_other->opt_sizes()[1] && impl_self->opt_sizes()[3] && -// impl_other->opt_sizes()[2] && -// (*impl_self->opt_sizes()[0] == *impl_other->opt_sizes()[0]) && -// (*impl_self->opt_sizes()[1] == *impl_other->opt_sizes()[1]) && -// (*impl_self->opt_sizes()[3] == *impl_other->opt_sizes()[2])) { -// #ifdef TRACEPACKED -// std::cout << "calling packed NT x NT matmul" << std::endl; -// #endif -// SizeNode new_nested_size = map( -// [&](c10::List self_size, c10::List other_size) { -// c10::List new_size{ -// self_size[0], self_size[1], other_size[2]}; -// return std::move(new_size); -// }, -// impl_self->nested_size(), -// impl_other->nested_size()); -// auto fn = [](c10::List leaf, int64_t input) { -// return input + leaf[0] * leaf[1] * leaf[2]; -// }; -// int64_t new_numel = reduce>( -// new_nested_size, fn, 0); -// // Tensor new_buffer = at::empty({new_numel}, self.options()); -// // Tensor result = -// // wrap_tensor_node(torch::nested_tensor::impl::build_structure( -// // std::move(new_buffer), new_nested_size)); -// return map_nested_tensor( -// [](//at::Tensor& result, -// at::Tensor self, -// at::Tensor other) { at::matmul(self, other); }, -// // result, -// self, -// other); -// // return result; -// } + if (is_nested_tensor_impl(other)) { + auto impl_other = get_nested_tensor_impl(other); + auto structure_other = get_nested_tensor_structure(other); + if (structure_self.buffer() && structure_other.buffer() && + self.dim() == 4 && other.dim() == 4 && impl_self->opt_sizes()[0] && + impl_other->opt_sizes()[0] && impl_self->opt_sizes()[1] && + impl_other->opt_sizes()[1] && impl_self->opt_sizes()[3] && + impl_other->opt_sizes()[2] && + (*impl_self->opt_sizes()[0] == *impl_other->opt_sizes()[0]) && + (*impl_self->opt_sizes()[1] == *impl_other->opt_sizes()[1]) && + (*impl_self->opt_sizes()[3] == *impl_other->opt_sizes()[2])) { +#ifdef TRACEPACKED + std::cout << "calling packed NT x NT matmul" << std::endl; +#endif + SizeNode new_nested_size = map( + [&](c10::List self_size, c10::List other_size) { + std::cout << "self_size: " << self_size[0] << " " << self_size[1] + << " " << self_size[2] << std::endl; + std::cout << "other_size: " << other_size[0] << " " << other_size[1] + << " " << other_size[2] << std::endl; + c10::List new_size{ + self_size[0], self_size[1], other_size[2]}; + std::cout << "new_size: " << new_size[0] << " " << new_size[1] + << " " << new_size[2] << std::endl; + return std::move(new_size); + }, + impl_self->nested_size(), + impl_other->nested_size()); + auto self_buffer = *structure_self.buffer(); + auto other_buffer = *structure_other.buffer(); + self_buffer = self_buffer.reshape({self.size(1), -1, self.size(3)}); + other_buffer = other_buffer.reshape({self.size(1), other.size(2), -1}); + std::cout << self_buffer.sizes() << std::endl; + std::cout << other_buffer.sizes() << std::endl; + auto result_buffer = at::bmm(self_buffer, other_buffer); + std::cout << result_buffer.sizes() << std::endl; + result_buffer = result_buffer.reshape({-1}); + auto rr = map_nested_tensor( + [](Tensor s, Tensor o) { return at::matmul(s, o); }, self, other); + std::cout << "rr.numel(): " << rr.numel() << std::endl; + apply_nested_tensor( + [](at::Tensor a) { + std::cout << "a.sizes(): " << a.sizes() << std::endl; + }, + rr); + return wrap_tensor_node(torch::nested_tensor::impl::build_structure( + std::move(result_buffer), new_nested_size)); + // auto fn = [](c10::List leaf, int64_t input) { + // return input + leaf[0] * leaf[1] * 
leaf[2]; + // }; + // int64_t new_numel = reduce>( + // new_nested_size, fn, 0); + // Tensor new_buffer = at::empty({new_numel}, self.options()); + // Tensor result = + // wrap_tensor_node(torch::nested_tensor::impl::build_structure( + // std::move(new_buffer), new_nested_size)); + // return map_nested_tensor( + // [](//at::Tensor& result, + // at::Tensor self, + // at::Tensor other) { at::matmul(self, other); }, + // // result, + // self, + // other); + // // return result; + } return map_nested_tensor( - [](Tensor s, Tensor o) { return at::matmul(s, o); }, - self, - other); + [](Tensor s, Tensor o) { return at::matmul(s, o); }, self, other); } if (structure_self.buffer()) { if (self.dim() == 3 && other.dim() == 2 && impl_self->opt_sizes()[0] && diff --git a/nestedtensor/csrc/mha.cpp b/nestedtensor/csrc/mha.cpp index 8441e49f..b46fdc5d 100644 --- a/nestedtensor/csrc/mha.cpp +++ b/nestedtensor/csrc/mha.cpp @@ -33,6 +33,7 @@ at::Tensor min_mha( TORCH_CHECK(value.dim() == 3, "value needs to be 3 dim."); int64_t edim = query.size(2); + //TODO: Use addmm! auto q = at::matmul(query, at::slice(in_proj_weight, 0, 0, edim)); auto k = at::matmul(key, at::slice(in_proj_weight, 0, edim, 2 * edim)); auto v = at::matmul(value, at::slice(in_proj_weight, 0, 2 * edim)); diff --git a/nestedtensor/nn/mha.py b/nestedtensor/nn/mha.py index ebe89f1b..64156b2c 100644 --- a/nestedtensor/nn/mha.py +++ b/nestedtensor/nn/mha.py @@ -69,10 +69,10 @@ def multi_head_attention_forward(query, # type: Nested head_dim = embed_dim // num_heads assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" scaling = float(head_dim) ** -0.5 - # print(query.nested_size()) - # print(key.nested_size()) - # print(value.nested_size()) - # print(in_proj_bias.size()) + print(query.nested_size()) + print(key.nested_size()) + print(value.nested_size()) + print(in_proj_bias.size()) return torch.ops.nestedtensor.min_mha(num_heads, head_dim, diff --git a/nestedtensor/version.py b/nestedtensor/version.py index b17166b5..517ca509 100644 --- a/nestedtensor/version.py +++ b/nestedtensor/version.py @@ -1,5 +1,5 @@ -__version__ = '0.0.1.dev20208251+17bf81d' -git_version = '17bf81da887e618d3931b229795b6320b26b3f78' +__version__ = '0.0.1.dev20208252+7804c5c' +git_version = '7804c5c437b1e5f4736f4b01d624873d04940983' from nestedtensor import _C if hasattr(_C, 'CUDA_VERSION'): cuda = _C.CUDA_VERSION From cae003de0c518bcb7f24f58f3c80e20e49a41f6a Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Mon, 24 Aug 2020 20:55:58 -0700 Subject: [PATCH 03/12] Checkpoint --- benchmarks/mha.py | 4 +- benchmarks/utils.py | 2 +- nestedtensor/csrc/matmul.cpp | 153 ++++++++++-------- nestedtensor/csrc/mha.cpp | 42 +++-- nestedtensor/nn/mha.py | 14 +- nestedtensor/version.py | 4 +- .../test_nested_tensor_autograd_functional.py | 3 + 7 files changed, 135 insertions(+), 87 deletions(-) diff --git a/benchmarks/mha.py b/benchmarks/mha.py index 9d117260..d07dc516 100644 --- a/benchmarks/mha.py +++ b/benchmarks/mha.py @@ -8,7 +8,7 @@ # Performance tanks hard for lots of small Tensors as expected random.seed(1010) RAND_INTS = [random.randint(10, 30) for _ in range(2000)] -RAND_INTS = [random.randint(100, 300) for _ in range(2)] +RAND_INTS = [random.randint(100, 300) for _ in range(20)] # (26, None, 256) (26, None, 256) (26, None, 256) torch.Size([256, 256]) torch.Size([256]) MODEL0 = torch.nn.MultiheadAttention(256, 8, dropout=0.1).cuda() @@ -34,4 +34,4 @@ def nt(): if __name__ == "__main__": print(utils.benchmark_fn(gen_nt_segmentation())) - # 
print(utils.benchmark_fn(gen_t_loop_segmentation())) + print(utils.benchmark_fn(gen_t_loop_segmentation())) diff --git a/benchmarks/utils.py b/benchmarks/utils.py index 0d1991fd..276f1ce3 100644 --- a/benchmarks/utils.py +++ b/benchmarks/utils.py @@ -30,7 +30,7 @@ def benchmark_fn(fn, run_time = 5.0, use_cprofile=False, warmup=1.0, cuda=False) fn() # if t > 1: # import sys; sys.exit(1) - import sys; sys.exit(1) + # import sys; sys.exit(1) if cuda: torch.cuda.synchronize() if use_cprofile: diff --git a/nestedtensor/csrc/matmul.cpp b/nestedtensor/csrc/matmul.cpp index 33c0d132..902bec3c 100644 --- a/nestedtensor/csrc/matmul.cpp +++ b/nestedtensor/csrc/matmul.cpp @@ -14,77 +14,90 @@ Tensor NestedTensor_matmul(const Tensor& self, const Tensor& other) { if (is_nested_tensor_impl(other)) { auto impl_other = get_nested_tensor_impl(other); auto structure_other = get_nested_tensor_structure(other); - if (structure_self.buffer() && structure_other.buffer() && - self.dim() == 4 && other.dim() == 4 && impl_self->opt_sizes()[0] && - impl_other->opt_sizes()[0] && impl_self->opt_sizes()[1] && - impl_other->opt_sizes()[1] && impl_self->opt_sizes()[3] && - impl_other->opt_sizes()[2] && - (*impl_self->opt_sizes()[0] == *impl_other->opt_sizes()[0]) && - (*impl_self->opt_sizes()[1] == *impl_other->opt_sizes()[1]) && - (*impl_self->opt_sizes()[3] == *impl_other->opt_sizes()[2])) { -#ifdef TRACEPACKED - std::cout << "calling packed NT x NT matmul" << std::endl; -#endif - SizeNode new_nested_size = map( - [&](c10::List self_size, c10::List other_size) { - std::cout << "self_size: " << self_size[0] << " " << self_size[1] - << " " << self_size[2] << std::endl; - std::cout << "other_size: " << other_size[0] << " " << other_size[1] - << " " << other_size[2] << std::endl; - c10::List new_size{ - self_size[0], self_size[1], other_size[2]}; - std::cout << "new_size: " << new_size[0] << " " << new_size[1] - << " " << new_size[2] << std::endl; - return std::move(new_size); - }, - impl_self->nested_size(), - impl_other->nested_size()); - auto self_buffer = *structure_self.buffer(); - auto other_buffer = *structure_other.buffer(); - self_buffer = self_buffer.reshape({self.size(1), -1, self.size(3)}); - other_buffer = other_buffer.reshape({self.size(1), other.size(2), -1}); - std::cout << self_buffer.sizes() << std::endl; - std::cout << other_buffer.sizes() << std::endl; - auto result_buffer = at::bmm(self_buffer, other_buffer); - std::cout << result_buffer.sizes() << std::endl; - result_buffer = result_buffer.reshape({-1}); - auto rr = map_nested_tensor( - [](Tensor s, Tensor o) { return at::matmul(s, o); }, self, other); - std::cout << "rr.numel(): " << rr.numel() << std::endl; - apply_nested_tensor( - [](at::Tensor a) { - std::cout << "a.sizes(): " << a.sizes() << std::endl; - }, - rr); - return wrap_tensor_node(torch::nested_tensor::impl::build_structure( - std::move(result_buffer), new_nested_size)); - // auto fn = [](c10::List leaf, int64_t input) { - // return input + leaf[0] * leaf[1] * leaf[2]; - // }; - // int64_t new_numel = reduce>( - // new_nested_size, fn, 0); - // Tensor new_buffer = at::empty({new_numel}, self.options()); - // Tensor result = - // wrap_tensor_node(torch::nested_tensor::impl::build_structure( - // std::move(new_buffer), new_nested_size)); - // return map_nested_tensor( - // [](//at::Tensor& result, - // at::Tensor self, - // at::Tensor other) { at::matmul(self, other); }, - // // result, - // self, - // other); - // // return result; - } +// if (structure_self.buffer() && 
structure_other.buffer() && +// self.dim() == 4 && other.dim() == 4 && impl_self->opt_sizes()[0] && +// impl_other->opt_sizes()[0] && impl_self->opt_sizes()[1] && +// impl_other->opt_sizes()[1] && impl_self->opt_sizes()[3] && +// impl_other->opt_sizes()[2] && +// (*impl_self->opt_sizes()[0] == *impl_other->opt_sizes()[0]) && +// (*impl_self->opt_sizes()[1] == *impl_other->opt_sizes()[1]) && +// (*impl_self->opt_sizes()[3] == *impl_other->opt_sizes()[2])) { +//#ifdef TRACEPACKED +// std::cout << "calling packed NT x NT matmul" << std::endl; +//#endif +// SizeNode new_nested_size = map( +// [&](c10::List self_size, c10::List other_size) { +// c10::List new_size{ +// self_size[0], self_size[1], other_size[2]}; +// return std::move(new_size); +// }, +// impl_self->nested_size(), +// impl_other->nested_size()); +// auto fn = [](c10::List leaf, int64_t input) { +// return input + leaf[0] * leaf[1] * leaf[2]; +// }; +// int64_t new_numel = reduce>( +// new_nested_size, fn, 0); +// Tensor new_buffer = at::empty({new_numel}, self.options()); +// Tensor result = +// wrap_tensor_node(torch::nested_tensor::impl::build_structure( +// std::move(new_buffer), new_nested_size)); +// apply_nested_tensor( +// [](at::Tensor& result, at::Tensor self, at::Tensor other) { +// at::matmul_out(result, self, other); +// }, +// result, +// self, +// other); +// return result; +// } return map_nested_tensor( [](Tensor s, Tensor o) { return at::matmul(s, o); }, self, other); } +// if (structure_self.buffer()) { +// if (self.dim() == 3 && other.dim() == 2 && impl_self->opt_sizes()[0] && +// impl_self->opt_sizes()[2] && +// impl_self->opt_sizes()[self.dim() - 1] == other.size(self.dim() - 2)) { +//#ifdef TRACEPACKED +// std::cout << "calling packed NT x T matmul" << std::endl; +//#endif +// SizeNode new_nested_size = map( +// [&](c10::List self_size) { +// c10::List new_size{self_size[0], other.size(1)}; +// return std::move(new_size); +// }, +// impl_self->nested_size()); +// return wrap_tensor_node(torch::nested_tensor::impl::build_structure( +// at::matmul( +// (*structure_self.buffer()).reshape({-1, other.size(0)}), other) +// .reshape(-1), +// new_nested_size)); +// } +// } + return map_nested_tensor( + [&other](Tensor tensor) { return at::matmul(tensor, other); }, self); +} + +Tensor NestedTensor_addmm( + const Tensor& input, + const Tensor& self, + const Tensor& other, + c10::Scalar alpha, + c10::Scalar beta) { + AutoGradMode autogradmode(false); + TORCH_CHECK(!is_nested_tensor_impl(input), "input must be Tensor"); + TORCH_CHECK(!is_nested_tensor_impl(other), "other must be Tensor"); + TORCH_CHECK(is_nested_tensor_impl(self), "self must be NestedTensor"); + // TORCH_CHECK(alpha == 1, "alpha must be 1."); + // TORCH_CHECK(beta == 1, "beta must be 1."); + auto impl_self = get_nested_tensor_impl(self); + auto structure_self = get_nested_tensor_structure(self); if (structure_self.buffer()) { if (self.dim() == 3 && other.dim() == 2 && impl_self->opt_sizes()[0] && impl_self->opt_sizes()[2] && impl_self->opt_sizes()[self.dim() - 1] == other.size(self.dim() - 2)) { #ifdef TRACEPACKED - std::cout << "calling packed NT x T matmul" << std::endl; + std::cout << "calling packed T x NT x T addmm" << std::endl; #endif SizeNode new_nested_size = map( [&](c10::List self_size) { @@ -93,14 +106,21 @@ Tensor NestedTensor_matmul(const Tensor& self, const Tensor& other) { }, impl_self->nested_size()); return wrap_tensor_node(torch::nested_tensor::impl::build_structure( - at::matmul( - (*structure_self.buffer()).reshape({-1, 
other.size(0)}), other) + at::addmm( + input, + (*structure_self.buffer()).reshape({-1, other.size(0)}), + other, + alpha, + beta) .reshape(-1), new_nested_size)); } } return map_nested_tensor( - [&other](Tensor tensor) { return at::matmul(tensor, other); }, self); + [&](Tensor tensor) { + return at::addmm(input, tensor, other, alpha, beta); + }, + self); } Tensor& NestedTensor_matmul_out( @@ -120,6 +140,7 @@ Tensor& NestedTensor_matmul_out( TORCH_LIBRARY_IMPL(aten, PrivateUse1_PreAutograd, m) { // nt_impl(m, "matmul", no_bw(TORCH_FN(NestedTensor_matmul); + nt_impl(m, "addmm", NestedTensor_addmm); nt_impl(m, "matmul", NestedTensor_matmul); nt_impl(m, "matmul.out", NestedTensor_matmul_out); } diff --git a/nestedtensor/csrc/mha.cpp b/nestedtensor/csrc/mha.cpp index b46fdc5d..3083fa11 100644 --- a/nestedtensor/csrc/mha.cpp +++ b/nestedtensor/csrc/mha.cpp @@ -33,27 +33,45 @@ at::Tensor min_mha( TORCH_CHECK(value.dim() == 3, "value needs to be 3 dim."); int64_t edim = query.size(2); - //TODO: Use addmm! - auto q = at::matmul(query, at::slice(in_proj_weight, 0, 0, edim)); - auto k = at::matmul(key, at::slice(in_proj_weight, 0, edim, 2 * edim)); - auto v = at::matmul(value, at::slice(in_proj_weight, 0, 2 * edim)); + at::Tensor q, k, v; + // TODO: Use addmm! + // if input.dim() == 2 and bias is not None: + // # fused op is marginally faster + // ret = torch.addmm(bias, input, weight.t()) if (in_proj_bias) { - q = q + at::slice(*in_proj_bias, 0, 0, edim); - k = k + at::slice(*in_proj_bias, 0, edim, 2 * edim); - v = v + at::slice(*in_proj_bias, 0, 2 * edim); + q = at::addmm( + at::slice(*in_proj_bias, 0, 0, edim), + query, + at::slice(in_proj_weight, 0, 0, edim).t(), + scaling, + scaling); + k = at::addmm( + at::slice(*in_proj_bias, 0, edim, 2 * edim), + key, + at::slice(in_proj_weight, 0, edim, 2 * edim).t()); + v = at::addmm( + at::slice(*in_proj_bias, 0, 2 * edim), + value, + at::slice(in_proj_weight, 0, 2 * edim).t()); + // q = q + at::slice(*in_proj_bias, 0, 0, edim); + // k = k + at::slice(*in_proj_bias, 0, edim, 2 * edim); + // v = v + at::slice(*in_proj_bias, 0, 2 * edim); + } else { + q = at::matmul(query, at::slice(in_proj_weight, 0, 0, edim).t()); + k = at::matmul(key, at::slice(in_proj_weight, 0, edim, 2 * edim).t()); + v = at::matmul(value, at::slice(in_proj_weight, 0, 2 * edim).t()); + q = at::mul(q, torch::tensor({scaling}, q.options())); } - q = at::mul(q, torch::tensor({scaling}, q.options())); - q = q.reshape({-1, -1, num_heads, head_dim}).transpose(1, 2); k = k.reshape({-1, -1, num_heads, head_dim}).transpose(1, 2); v = v.reshape({-1, -1, num_heads, head_dim}).transpose(1, 2); auto attn_output_weights = at::matmul(q, k.transpose(2, 3)); - attn_output_weights = at::softmax(attn_output_weights, -1).contiguous(); + attn_output_weights = at::softmax(attn_output_weights, -1); attn_output_weights = at::dropout(attn_output_weights, dropout_p, training); auto attn_output = at::matmul(attn_output_weights, v); - attn_output = attn_output.transpose(1, 2).reshape({-1, -1, edim}).contiguous(); - attn_output = at::matmul(attn_output, out_proj_weight); + attn_output = attn_output.transpose(1, 2).reshape({-1, -1, edim}); + attn_output = at::matmul(attn_output, out_proj_weight.t()); attn_output += out_proj_bias; return attn_output; } diff --git a/nestedtensor/nn/mha.py b/nestedtensor/nn/mha.py index 64156b2c..a84000f9 100644 --- a/nestedtensor/nn/mha.py +++ b/nestedtensor/nn/mha.py @@ -47,6 +47,9 @@ def multi_head_attention_forward(query, # type: Nested assert isinstance(query, 
nestedtensor.NestedTensor) assert isinstance(key, nestedtensor.NestedTensor) assert isinstance(value, nestedtensor.NestedTensor) + query = query.contiguous() + key = key.contiguous() + value = value.contiguous() assert torch.is_tensor(out_proj_weight) assert torch.is_tensor(out_proj_bias) @@ -69,10 +72,10 @@ def multi_head_attention_forward(query, # type: Nested head_dim = embed_dim // num_heads assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" scaling = float(head_dim) ** -0.5 - print(query.nested_size()) - print(key.nested_size()) - print(value.nested_size()) - print(in_proj_bias.size()) + # print(query.nested_size()) + # print(key.nested_size()) + # print(value.nested_size()) + # print(in_proj_bias.size()) return torch.ops.nestedtensor.min_mha(num_heads, head_dim, @@ -92,9 +95,11 @@ def multi_head_attention_forward(query, # type: Nested _start = 0 _end = embed_dim _w = in_proj_weight[_start:_end, :] + print(_w.sum()) if _b is not None: _b = _b[_start:_end] q = F.linear(query, _w, _b) + print(q.sum()) # This is inline in_proj function with in_proj_weight and in_proj_bias _b = in_proj_bias @@ -114,6 +119,7 @@ def multi_head_attention_forward(query, # type: Nested _b = _b[_start:] v = F.linear(value, _w, _b) q = q * scaling + print(q.sum()) # NOTE: This is usually contiguous plus a view q = q.reshape(-1, -1, num_heads, head_dim).transpose(1, 2) diff --git a/nestedtensor/version.py b/nestedtensor/version.py index 517ca509..5dbadc67 100644 --- a/nestedtensor/version.py +++ b/nestedtensor/version.py @@ -1,5 +1,5 @@ -__version__ = '0.0.1.dev20208252+7804c5c' -git_version = '7804c5c437b1e5f4736f4b01d624873d04940983' +__version__ = '0.0.1.dev20208253+eec9dfd' +git_version = 'eec9dfdf38998aa76b6182a60bdef4df6eb7796f' from nestedtensor import _C if hasattr(_C, 'CUDA_VERSION'): cuda = _C.CUDA_VERSION diff --git a/test/test_nested_tensor_autograd_functional.py b/test/test_nested_tensor_autograd_functional.py index b41d5849..7376beef 100644 --- a/test/test_nested_tensor_autograd_functional.py +++ b/test/test_nested_tensor_autograd_functional.py @@ -224,6 +224,7 @@ def _test(FCNHead): def test_mha(self): embed_dim = 2 num_heads = 2 + torch.manual_seed(1010) mha = torch.nn.MultiheadAttention(embed_dim, num_heads) query = torch.randn(3, 1, embed_dim) key = torch.randn(2, 1, embed_dim) @@ -241,6 +242,8 @@ def test_mha(self): query_nt, key_nt, value_nt, need_weights=False) # nt_attn_output.sum().backward() # For regular tensors the batch dimension is along dimension 1 + print(attn_output.sum()) + print(nt_attn_output.sum()) self.assertEqual(attn_output.squeeze(1), nt_attn_output[0]) From ecbbf50c9021d97d2f83d3e6d704f1754b4557fd Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Mon, 24 Aug 2020 20:59:51 -0700 Subject: [PATCH 04/12] Checkpoint --- benchmarks/utils.py | 3 - nestedtensor/csrc/autograd_functions.cpp | 5 - nestedtensor/csrc/matmul.cpp | 150 ++++++++++++----------- 3 files changed, 76 insertions(+), 82 deletions(-) diff --git a/benchmarks/utils.py b/benchmarks/utils.py index 276f1ce3..d1268602 100644 --- a/benchmarks/utils.py +++ b/benchmarks/utils.py @@ -28,9 +28,6 @@ def benchmark_fn(fn, run_time = 5.0, use_cprofile=False, warmup=1.0, cuda=False) if use_cprofile: pr.enable() fn() - # if t > 1: - # import sys; sys.exit(1) - # import sys; sys.exit(1) if cuda: torch.cuda.synchronize() if use_cprofile: diff --git a/nestedtensor/csrc/autograd_functions.cpp b/nestedtensor/csrc/autograd_functions.cpp index fdf5d8f5..5018dd82 100644 --- 
a/nestedtensor/csrc/autograd_functions.cpp +++ b/nestedtensor/csrc/autograd_functions.cpp @@ -206,11 +206,6 @@ Tensor NestedTensor_threshold_backward( } Tensor NestedTensor_dropout(const Tensor& input, double p, bool train) { - if (is_packed(input)) { - return wrap_tensor_node(torch::nested_tensor::impl::build_structure( - at::dropout(*get_nested_tensor_structure(input).buffer(), p, train), - get_nested_tensor_impl(input)->nested_size())); - } return autograd_map_nested_tensor( [&](const at::Tensor t) { return at::dropout(t, p, train); }, input); } diff --git a/nestedtensor/csrc/matmul.cpp b/nestedtensor/csrc/matmul.cpp index 902bec3c..d52b5b16 100644 --- a/nestedtensor/csrc/matmul.cpp +++ b/nestedtensor/csrc/matmul.cpp @@ -14,70 +14,87 @@ Tensor NestedTensor_matmul(const Tensor& self, const Tensor& other) { if (is_nested_tensor_impl(other)) { auto impl_other = get_nested_tensor_impl(other); auto structure_other = get_nested_tensor_structure(other); -// if (structure_self.buffer() && structure_other.buffer() && -// self.dim() == 4 && other.dim() == 4 && impl_self->opt_sizes()[0] && -// impl_other->opt_sizes()[0] && impl_self->opt_sizes()[1] && -// impl_other->opt_sizes()[1] && impl_self->opt_sizes()[3] && -// impl_other->opt_sizes()[2] && -// (*impl_self->opt_sizes()[0] == *impl_other->opt_sizes()[0]) && -// (*impl_self->opt_sizes()[1] == *impl_other->opt_sizes()[1]) && -// (*impl_self->opt_sizes()[3] == *impl_other->opt_sizes()[2])) { -//#ifdef TRACEPACKED -// std::cout << "calling packed NT x NT matmul" << std::endl; -//#endif -// SizeNode new_nested_size = map( -// [&](c10::List self_size, c10::List other_size) { -// c10::List new_size{ -// self_size[0], self_size[1], other_size[2]}; -// return std::move(new_size); -// }, -// impl_self->nested_size(), -// impl_other->nested_size()); -// auto fn = [](c10::List leaf, int64_t input) { -// return input + leaf[0] * leaf[1] * leaf[2]; -// }; -// int64_t new_numel = reduce>( -// new_nested_size, fn, 0); -// Tensor new_buffer = at::empty({new_numel}, self.options()); -// Tensor result = -// wrap_tensor_node(torch::nested_tensor::impl::build_structure( -// std::move(new_buffer), new_nested_size)); -// apply_nested_tensor( -// [](at::Tensor& result, at::Tensor self, at::Tensor other) { -// at::matmul_out(result, self, other); -// }, -// result, -// self, -// other); -// return result; -// } + if (structure_self.buffer() && structure_other.buffer() && + self.dim() == 4 && other.dim() == 4 && impl_self->opt_sizes()[0] && + impl_other->opt_sizes()[0] && impl_self->opt_sizes()[1] && + impl_other->opt_sizes()[1] && impl_self->opt_sizes()[3] && + impl_other->opt_sizes()[2] && + (*impl_self->opt_sizes()[0] == *impl_other->opt_sizes()[0]) && + (*impl_self->opt_sizes()[1] == *impl_other->opt_sizes()[1]) && + (*impl_self->opt_sizes()[3] == *impl_other->opt_sizes()[2])) { +#ifdef TRACEPACKED + std::cout << "calling packed NT x NT matmul" << std::endl; +#endif + SizeNode new_nested_size = map( + [&](c10::List self_size, c10::List other_size) { + c10::List new_size{ + self_size[0], self_size[1], other_size[2]}; + return std::move(new_size); + }, + impl_self->nested_size(), + impl_other->nested_size()); + auto fn = [](c10::List leaf, int64_t input) { + return input + leaf[0] * leaf[1] * leaf[2]; + }; + int64_t new_numel = reduce>( + new_nested_size, fn, 0); + Tensor new_buffer = at::empty({new_numel}, self.options()); + Tensor result = + wrap_tensor_node(torch::nested_tensor::impl::build_structure( + std::move(new_buffer), new_nested_size)); + 
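// Write each constituent product straight into the packed buffer:
// `result` was built over new_buffer above, so the matmul_out calls
// below fill a single flat allocation for the whole NT x NT batch
// instead of allocating one output per constituent.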
apply_nested_tensor( + [](at::Tensor& result, + at::Tensor self, + at::Tensor other) { at::matmul_out(result, self, other); }, + result, + self, + other); + return result; + } return map_nested_tensor( - [](Tensor s, Tensor o) { return at::matmul(s, o); }, self, other); + [](Tensor s, Tensor o) { return at::matmul(s, o); }, + self, + other); + } + if (structure_self.buffer()) { + if (self.dim() == 3 && other.dim() == 2 && impl_self->opt_sizes()[0] && + impl_self->opt_sizes()[2] && + impl_self->opt_sizes()[self.dim() - 1] == other.size(self.dim() - 2)) { +#ifdef TRACEPACKED + std::cout << "calling packed NT x T matmul" << std::endl; +#endif + SizeNode new_nested_size = map( + [&](c10::List self_size) { + c10::List new_size{self_size[0], other.size(1)}; + return std::move(new_size); + }, + impl_self->nested_size()); + return wrap_tensor_node(torch::nested_tensor::impl::build_structure( + at::matmul( + (*structure_self.buffer()).reshape({-1, other.size(0)}), other) + .reshape(-1), + new_nested_size)); + } } -// if (structure_self.buffer()) { -// if (self.dim() == 3 && other.dim() == 2 && impl_self->opt_sizes()[0] && -// impl_self->opt_sizes()[2] && -// impl_self->opt_sizes()[self.dim() - 1] == other.size(self.dim() - 2)) { -//#ifdef TRACEPACKED -// std::cout << "calling packed NT x T matmul" << std::endl; -//#endif -// SizeNode new_nested_size = map( -// [&](c10::List self_size) { -// c10::List new_size{self_size[0], other.size(1)}; -// return std::move(new_size); -// }, -// impl_self->nested_size()); -// return wrap_tensor_node(torch::nested_tensor::impl::build_structure( -// at::matmul( -// (*structure_self.buffer()).reshape({-1, other.size(0)}), other) -// .reshape(-1), -// new_nested_size)); -// } -// } return map_nested_tensor( [&other](Tensor tensor) { return at::matmul(tensor, other); }, self); } +Tensor& NestedTensor_matmul_out( + Tensor& result, + const Tensor& self, + const Tensor& other) { + AutoGradMode autogradmode(false); + apply_nested_tensor( + [](Tensor& result, Tensor& tensor, Tensor& other) { + return at::matmul_out(result, tensor, other); + }, + result, + self, + other); + return result; +} + Tensor NestedTensor_addmm( const Tensor& input, const Tensor& self, @@ -123,24 +140,9 @@ Tensor NestedTensor_addmm( self); } -Tensor& NestedTensor_matmul_out( - Tensor& result, - const Tensor& self, - const Tensor& other) { - AutoGradMode autogradmode(false); - apply_nested_tensor( - [](Tensor& result, Tensor& tensor, Tensor& other) { - return at::matmul_out(result, tensor, other); - }, - result, - self, - other); - return result; -} - TORCH_LIBRARY_IMPL(aten, PrivateUse1_PreAutograd, m) { - // nt_impl(m, "matmul", no_bw(TORCH_FN(NestedTensor_matmul); nt_impl(m, "addmm", NestedTensor_addmm); + // nt_impl(m, "matmul", no_bw(TORCH_FN(NestedTensor_matmul); nt_impl(m, "matmul", NestedTensor_matmul); nt_impl(m, "matmul.out", NestedTensor_matmul_out); } From 768d2258f61caee8ee45978fbff2489383c68d59 Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Mon, 24 Aug 2020 21:00:50 -0700 Subject: [PATCH 05/12] Checkpoint --- nestedtensor/nn/mha.py | 51 ------------------------------------------ 1 file changed, 51 deletions(-) diff --git a/nestedtensor/nn/mha.py b/nestedtensor/nn/mha.py index a84000f9..868180e8 100644 --- a/nestedtensor/nn/mha.py +++ b/nestedtensor/nn/mha.py @@ -72,10 +72,6 @@ def multi_head_attention_forward(query, # type: Nested head_dim = embed_dim // num_heads assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" scaling = float(head_dim) ** 
-0.5 - # print(query.nested_size()) - # print(key.nested_size()) - # print(value.nested_size()) - # print(in_proj_bias.size()) return torch.ops.nestedtensor.min_mha(num_heads, head_dim, @@ -90,53 +86,6 @@ def multi_head_attention_forward(query, # type: Nested out_proj_weight, out_proj_bias), None - # This is inline in_proj function with in_proj_weight and in_proj_bias - _b = in_proj_bias - _start = 0 - _end = embed_dim - _w = in_proj_weight[_start:_end, :] - print(_w.sum()) - if _b is not None: - _b = _b[_start:_end] - q = F.linear(query, _w, _b) - print(q.sum()) - - # This is inline in_proj function with in_proj_weight and in_proj_bias - _b = in_proj_bias - _start = embed_dim - _end = embed_dim * 2 - _w = in_proj_weight[_start:_end, :] - if _b is not None: - _b = _b[_start:_end] - k = F.linear(key, _w, _b) - - # This is inline in_proj function with in_proj_weight and in_proj_bias - _b = in_proj_bias - _start = embed_dim * 2 - _end = None - _w = in_proj_weight[_start:, :] - if _b is not None: - _b = _b[_start:] - v = F.linear(value, _w, _b) - q = q * scaling - print(q.sum()) - - # NOTE: This is usually contiguous plus a view - q = q.reshape(-1, -1, num_heads, head_dim).transpose(1, 2) - if k is not None: - k = k.reshape(-1, -1, num_heads, head_dim).transpose(1, 2) - if v is not None: - v = v.reshape(-1, -1, num_heads, head_dim).transpose(1, 2) - attn_output_weights = torch.matmul(q, k.transpose(2, 3)) - attn_output_weights = F.softmax( - attn_output_weights, dim=-1) - attn_output_weights = F.dropout( - attn_output_weights, p=dropout_p, training=training) - attn_output = torch.matmul(attn_output_weights, v) - attn_output = attn_output.transpose(1, 2).reshape(-1, -1, embed_dim) - attn_output = F.linear(attn_output, out_proj_weight, out_proj_bias) - return attn_output, None - class MultiheadAttention(Module): __annotations__ = { From 759861199cb721bbd76fd56d787881ac096629f6 Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Mon, 24 Aug 2020 21:01:14 -0700 Subject: [PATCH 06/12] Checkpoint --- nestedtensor/csrc/mha.cpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/nestedtensor/csrc/mha.cpp b/nestedtensor/csrc/mha.cpp index 3083fa11..62303ff8 100644 --- a/nestedtensor/csrc/mha.cpp +++ b/nestedtensor/csrc/mha.cpp @@ -34,10 +34,6 @@ at::Tensor min_mha( int64_t edim = query.size(2); at::Tensor q, k, v; - // TODO: Use addmm! 
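// (This is what the addmm calls below implement: at::addmm(b, x, W.t(),
// beta, alpha) returns beta * b + alpha * (x @ W.t()), so passing
// `scaling` for both beta and alpha on the q projection folds the
// 1/sqrt(head_dim) scaling into the same GEMM as the bias add.)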
- // if input.dim() == 2 and bias is not None: - // # fused op is marginally faster - // ret = torch.addmm(bias, input, weight.t()) if (in_proj_bias) { q = at::addmm( at::slice(*in_proj_bias, 0, 0, edim), @@ -53,9 +49,6 @@ at::Tensor min_mha( at::slice(*in_proj_bias, 0, 2 * edim), value, at::slice(in_proj_weight, 0, 2 * edim).t()); - // q = q + at::slice(*in_proj_bias, 0, 0, edim); - // k = k + at::slice(*in_proj_bias, 0, edim, 2 * edim); - // v = v + at::slice(*in_proj_bias, 0, 2 * edim); } else { q = at::matmul(query, at::slice(in_proj_weight, 0, 0, edim).t()); k = at::matmul(key, at::slice(in_proj_weight, 0, edim, 2 * edim).t()); From e8c591a13f9634f64d41241cd7770c5ce1fd9523 Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Mon, 24 Aug 2020 21:02:25 -0700 Subject: [PATCH 07/12] Checkpoint --- nestedtensor/csrc/py_init.cpp | 2 +- nestedtensor/nn/mha.py | 3 --- nestedtensor/version.py | 4 ++-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/nestedtensor/csrc/py_init.cpp b/nestedtensor/csrc/py_init.cpp index c696738b..20cc0ef1 100644 --- a/nestedtensor/csrc/py_init.cpp +++ b/nestedtensor/csrc/py_init.cpp @@ -7,7 +7,6 @@ #include #include #include -#include // NOTE: A NestedTensor without any constituents, i.e. // nested_tensor([]) is of dimension 1 because @@ -275,4 +274,5 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { }); add_functions(m); + add_mha(m); } diff --git a/nestedtensor/nn/mha.py b/nestedtensor/nn/mha.py index 868180e8..963a3444 100644 --- a/nestedtensor/nn/mha.py +++ b/nestedtensor/nn/mha.py @@ -47,9 +47,6 @@ def multi_head_attention_forward(query, # type: Nested assert isinstance(query, nestedtensor.NestedTensor) assert isinstance(key, nestedtensor.NestedTensor) assert isinstance(value, nestedtensor.NestedTensor) - query = query.contiguous() - key = key.contiguous() - value = value.contiguous() assert torch.is_tensor(out_proj_weight) assert torch.is_tensor(out_proj_bias) diff --git a/nestedtensor/version.py b/nestedtensor/version.py index 5dbadc67..2a70879c 100644 --- a/nestedtensor/version.py +++ b/nestedtensor/version.py @@ -1,5 +1,5 @@ -__version__ = '0.0.1.dev20208253+eec9dfd' -git_version = 'eec9dfdf38998aa76b6182a60bdef4df6eb7796f' +__version__ = '0.0.1.dev20208254+7598611' +git_version = '759861199cb721bbd76fd56d787881ac096629f6' from nestedtensor import _C if hasattr(_C, 'CUDA_VERSION'): cuda = _C.CUDA_VERSION From bfefe757d4cbfd7acc7303497980ea9816c8e5f4 Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Mon, 24 Aug 2020 21:03:30 -0700 Subject: [PATCH 08/12] Checkpoint --- nestedtensor/csrc/py_init.cpp | 1 - nestedtensor/version.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/nestedtensor/csrc/py_init.cpp b/nestedtensor/csrc/py_init.cpp index 20cc0ef1..d2a1579d 100644 --- a/nestedtensor/csrc/py_init.cpp +++ b/nestedtensor/csrc/py_init.cpp @@ -274,5 +274,4 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { }); add_functions(m); - add_mha(m); } diff --git a/nestedtensor/version.py b/nestedtensor/version.py index 2a70879c..1e96683d 100644 --- a/nestedtensor/version.py +++ b/nestedtensor/version.py @@ -1,5 +1,5 @@ -__version__ = '0.0.1.dev20208254+7598611' -git_version = '759861199cb721bbd76fd56d787881ac096629f6' +__version__ = '0.0.1.dev20208254+e8c591a' +git_version = 'e8c591a13f9634f64d41241cd7770c5ce1fd9523' from nestedtensor import _C if hasattr(_C, 'CUDA_VERSION'): cuda = _C.CUDA_VERSION From 8c381e21692b923eb2ba58b84c1fe5955ae207ad Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Mon, 24 Aug 2020 21:04:22 -0700 
Subject: [PATCH 09/12] Checkpoint --- nestedtensor/csrc/mha.h | 0 nestedtensor/version.py | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) delete mode 100644 nestedtensor/csrc/mha.h diff --git a/nestedtensor/csrc/mha.h b/nestedtensor/csrc/mha.h deleted file mode 100644 index e69de29b..00000000 diff --git a/nestedtensor/version.py b/nestedtensor/version.py index 1e96683d..15a59192 100644 --- a/nestedtensor/version.py +++ b/nestedtensor/version.py @@ -1,5 +1,5 @@ -__version__ = '0.0.1.dev20208254+e8c591a' -git_version = 'e8c591a13f9634f64d41241cd7770c5ce1fd9523' +__version__ = '0.0.1.dev20208254+bfefe75' +git_version = 'bfefe757d4cbfd7acc7303497980ea9816c8e5f4' from nestedtensor import _C if hasattr(_C, 'CUDA_VERSION'): cuda = _C.CUDA_VERSION From c4d8dc2f5786e5ddb96a7458a319c253d0261e73 Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Mon, 24 Aug 2020 21:07:33 -0700 Subject: [PATCH 10/12] Checkpoint --- nestedtensor/csrc/mha.cpp | 3 +-- nestedtensor/version.py | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/nestedtensor/csrc/mha.cpp b/nestedtensor/csrc/mha.cpp index 62303ff8..c3357774 100644 --- a/nestedtensor/csrc/mha.cpp +++ b/nestedtensor/csrc/mha.cpp @@ -64,8 +64,7 @@ at::Tensor min_mha( attn_output_weights = at::dropout(attn_output_weights, dropout_p, training); auto attn_output = at::matmul(attn_output_weights, v); attn_output = attn_output.transpose(1, 2).reshape({-1, -1, edim}); - attn_output = at::matmul(attn_output, out_proj_weight.t()); - attn_output += out_proj_bias; + attn_output = at::addmm(out_proj_bias, attn_output, out_proj_weight.t()); return attn_output; } diff --git a/nestedtensor/version.py b/nestedtensor/version.py index 15a59192..49ba8adc 100644 --- a/nestedtensor/version.py +++ b/nestedtensor/version.py @@ -1,5 +1,5 @@ -__version__ = '0.0.1.dev20208254+bfefe75' -git_version = 'bfefe757d4cbfd7acc7303497980ea9816c8e5f4' +__version__ = '0.0.1.dev20208254+8c381e2' +git_version = '8c381e21692b923eb2ba58b84c1fe5955ae207ad' from nestedtensor import _C if hasattr(_C, 'CUDA_VERSION'): cuda = _C.CUDA_VERSION From 2e7142023fb185bcfd166784632462e3ab0e87f6 Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Mon, 24 Aug 2020 21:50:50 -0700 Subject: [PATCH 11/12] Checkpoint --- benchmarks/frozenbatchnorm2d.py | 73 ++++++++++++++++++++++++ benchmarks/mha.py | 9 ++- nestedtensor/csrc/autograd_functions.cpp | 2 +- nestedtensor/version.py | 4 +- 4 files changed, 80 insertions(+), 8 deletions(-) create mode 100644 benchmarks/frozenbatchnorm2d.py diff --git a/benchmarks/frozenbatchnorm2d.py b/benchmarks/frozenbatchnorm2d.py new file mode 100644 index 00000000..4172ebb8 --- /dev/null +++ b/benchmarks/frozenbatchnorm2d.py @@ -0,0 +1,73 @@ +import torch +import nestedtensor +import utils +import torchvision + +import random + +random.seed(1010) +RAND_INTS = [random.randint(10, 30) for _ in range(2000)] +RAND_INTS = [random.randint(100, 300) for _ in range(20)] + +class FrozenBatchNorm2d(torch.nn.Module): + """ + BatchNorm2d where the batch statistics and the affine parameters are fixed. + + Copy-paste from torchvision.misc.ops with added eps before rqsrt, + without which any other models than torchvision.models.resnet[18,34,50,101] + produce nans. 
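+    The affine transform is folded into two constants up front,
+    scale = weight * (running_var + eps).rsqrt() and
+    bias = bias - running_mean * scale, so forward() reduces to a
+    single fused multiply-add, y = x * scale + bias.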
+ """ + + def __init__(self, n): + super(FrozenBatchNorm2d, self).__init__() + self.register_buffer("weight", torch.ones(n)) + self.register_buffer("bias", torch.zeros(n)) + self.register_buffer("running_mean", torch.zeros(n)) + self.register_buffer("running_var", torch.ones(n)) + + def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs): + num_batches_tracked_key = prefix + 'num_batches_tracked' + if num_batches_tracked_key in state_dict: + del state_dict[num_batches_tracked_key] + + super(FrozenBatchNorm2d, self)._load_from_state_dict( + state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs) + + def forward(self, x): + print("DHDHDH") + # move reshapes to the beginning + # to make it fuser-friendly + w = self.weight.reshape(1, -1, 1, 1) + b = self.bias.reshape(1, -1, 1, 1) + rv = self.running_var.reshape(1, -1, 1, 1) + rm = self.running_mean.reshape(1, -1, 1, 1) + eps = 1e-5 + scale = w * (rv + eps).rsqrt() + bias = b - rm * scale + return (x * scale + bias).squeeze(1) + +MODEL = FrozenBatchNorm2d(64) + +def gen_t_loop_frozenbatchnorm2d(): + tensors = [torch.rand(64, i, 256).cuda() for i in RAND_INTS] + + def t_loop(): + for t in tensors: + MODEL(t.unsqueeze(0)) + return t_loop + + +def gen_nt_frozenbatchnorm2d(): + nt0 = nestedtensor.nested_tensor( + [torch.rand(64, i, 256).cuda() for i in RAND_INTS]) + + def nt(): + MODEL(nt0) + return nt + + +if __name__ == "__main__": + print(utils.benchmark_fn(gen_nt_frozenbatchnorm2d())) + print(utils.benchmark_fn(gen_t_loop_frozenbatchnorm2d())) diff --git a/benchmarks/mha.py b/benchmarks/mha.py index d07dc516..8fbcbaf4 100644 --- a/benchmarks/mha.py +++ b/benchmarks/mha.py @@ -5,7 +5,6 @@ import random -# Performance tanks hard for lots of small Tensors as expected random.seed(1010) RAND_INTS = [random.randint(10, 30) for _ in range(2000)] RAND_INTS = [random.randint(100, 300) for _ in range(20)] @@ -14,7 +13,7 @@ MODEL0 = torch.nn.MultiheadAttention(256, 8, dropout=0.1).cuda() MODEL1 = nestedtensor.nn.MultiheadAttention(256, 8, dropout=0.1).cuda() -def gen_t_loop_segmentation(): +def gen_t_loop_mha(): tensors = [torch.rand(1, i, 256).cuda() for i in RAND_INTS] def t_loop(): @@ -23,7 +22,7 @@ def t_loop(): return t_loop -def gen_nt_segmentation(): +def gen_nt_mha(): nt0 = nestedtensor.nested_tensor( [torch.rand(i, 256).cuda() for i in RAND_INTS]) @@ -33,5 +32,5 @@ def nt(): if __name__ == "__main__": - print(utils.benchmark_fn(gen_nt_segmentation())) - print(utils.benchmark_fn(gen_t_loop_segmentation())) + print(utils.benchmark_fn(gen_nt_mha())) + print(utils.benchmark_fn(gen_t_loop_mha())) diff --git a/nestedtensor/csrc/autograd_functions.cpp b/nestedtensor/csrc/autograd_functions.cpp index 5018dd82..7312acc1 100644 --- a/nestedtensor/csrc/autograd_functions.cpp +++ b/nestedtensor/csrc/autograd_functions.cpp @@ -58,7 +58,7 @@ struct NestedTensorFunction_batch_norm cudnn_enabled) .squeeze(0); }, - autograd_input); + autograd_input).contiguous(); ctx->saved_data["0"] = weight; ctx->saved_data["1"] = bias; ctx->saved_data["2"] = autograd_output; diff --git a/nestedtensor/version.py b/nestedtensor/version.py index 49ba8adc..46b83c65 100644 --- a/nestedtensor/version.py +++ b/nestedtensor/version.py @@ -1,5 +1,5 @@ -__version__ = '0.0.1.dev20208254+8c381e2' -git_version = '8c381e21692b923eb2ba58b84c1fe5955ae207ad' +__version__ = '0.0.1.dev20208254+c4d8dc2' +git_version = 'c4d8dc2f5786e5ddb96a7458a319c253d0261e73' from nestedtensor import _C if 
hasattr(_C, 'CUDA_VERSION'): cuda = _C.CUDA_VERSION From f5e6a469c1b28d811a5ecec269915876f03039a2 Mon Sep 17 00:00:00 2001 From: Christian Puhrsch Date: Tue, 25 Aug 2020 07:10:18 -0700 Subject: [PATCH 12/12] Checkpoint --- benchmarks/frozenbatchnorm2d.py | 8 ++++++-- benchmarks/mha.py | 5 ----- benchmarks/utils.py | 1 + nestedtensor/csrc/BinaryOps.cpp | 16 ++++++++++------ nestedtensor/csrc/functions.cpp | 2 +- nestedtensor/nn/mha.py | 3 +++ nestedtensor/version.py | 4 ++-- 7 files changed, 23 insertions(+), 16 deletions(-) diff --git a/benchmarks/frozenbatchnorm2d.py b/benchmarks/frozenbatchnorm2d.py index 4172ebb8..4945de2f 100644 --- a/benchmarks/frozenbatchnorm2d.py +++ b/benchmarks/frozenbatchnorm2d.py @@ -36,7 +36,6 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs) def forward(self, x): - print("DHDHDH") # move reshapes to the beginning # to make it fuser-friendly w = self.weight.reshape(1, -1, 1, 1) @@ -46,9 +45,14 @@ def forward(self, x): eps = 1e-5 scale = w * (rv + eps).rsqrt() bias = b - rm * scale + # print(scale.size()) + # print(bias.size()) + # print(type(scale)) + # print(type(bias)) + # print(x.nested_size()) return (x * scale + bias).squeeze(1) -MODEL = FrozenBatchNorm2d(64) +MODEL = FrozenBatchNorm2d(64).cuda() def gen_t_loop_frozenbatchnorm2d(): tensors = [torch.rand(64, i, 256).cuda() for i in RAND_INTS] diff --git a/benchmarks/mha.py b/benchmarks/mha.py index 776a9aaf..8fbcbaf4 100644 --- a/benchmarks/mha.py +++ b/benchmarks/mha.py @@ -32,10 +32,5 @@ def nt(): if __name__ == "__main__": -<<<<<<< HEAD print(utils.benchmark_fn(gen_nt_mha())) print(utils.benchmark_fn(gen_t_loop_mha())) -======= - print(utils.benchmark_fn(gen_nt_segmentation())) - print(utils.benchmark_fn(gen_t_loop_segmentation())) ->>>>>>> master diff --git a/benchmarks/utils.py b/benchmarks/utils.py index d1268602..61e0fc5f 100644 --- a/benchmarks/utils.py +++ b/benchmarks/utils.py @@ -28,6 +28,7 @@ def benchmark_fn(fn, run_time = 5.0, use_cprofile=False, warmup=1.0, cuda=False) if use_cprofile: pr.enable() fn() + # import sys; sys.exit(1) if cuda: torch.cuda.synchronize() if use_cprofile: diff --git a/nestedtensor/csrc/BinaryOps.cpp b/nestedtensor/csrc/BinaryOps.cpp index c91a0587..b020dfe3 100644 --- a/nestedtensor/csrc/BinaryOps.cpp +++ b/nestedtensor/csrc/BinaryOps.cpp @@ -43,14 +43,18 @@ Tensor NestedTensor_binary(const Tensor& self, const Tensor& other) { return map_nested_tensor( [&self](Tensor other) { return func(self, other); }, other); } - if (is_packed(self) && (other.dim() == 0 || (other.dim() == 1 && other.numel() == 1))) { + if (is_packed(self)) { + auto self_structure = get_nested_tensor_structure(self); + auto self_impl = get_nested_tensor_impl(self); + if (other.dim() == 0 || (other.dim() == 1 && other.numel() == 1)) { #ifdef TRACEPACKED - std::cout << "calling packed binary " << typeid(func).name() << std::endl; + std::cout << "calling packed binary NT x T 0-dim / 1-dim 1-numel" + << typeid(func).name() << std::endl; #endif - auto self_structure = get_nested_tensor_structure(self); - return wrap_tensor_node(torch::nested_tensor::impl::build_structure( - func((*self_structure.buffer()), other), - get_nested_tensor_impl(self)->nested_size())); + return wrap_tensor_node(torch::nested_tensor::impl::build_structure( + func((*self_structure.buffer()), other), + get_nested_tensor_impl(self)->nested_size())); + } } return map_nested_tensor( [&other](Tensor self) { return func(self, other); }, self); diff --git 
a/nestedtensor/csrc/functions.cpp b/nestedtensor/csrc/functions.cpp index aa6b5ddf..d15dd2c6 100644 --- a/nestedtensor/csrc/functions.cpp +++ b/nestedtensor/csrc/functions.cpp @@ -139,7 +139,7 @@ Tensor NestedTensor_layer_norm( [normalized_shape, &weight, &bias, eps](const at::Tensor t) { return at::layer_norm(t, normalized_shape, weight, bias, eps, true); }, - input); + input).contiguous(); } Tensor NestedTensor_all(const Tensor& self) { diff --git a/nestedtensor/nn/mha.py b/nestedtensor/nn/mha.py index 963a3444..84e6f7af 100644 --- a/nestedtensor/nn/mha.py +++ b/nestedtensor/nn/mha.py @@ -49,6 +49,9 @@ def multi_head_attention_forward(query, # type: Nested assert isinstance(value, nestedtensor.NestedTensor) assert torch.is_tensor(out_proj_weight) assert torch.is_tensor(out_proj_bias) + query = query.contiguous() + key = key.contiguous() + value = value.contiguous() # TODO: Explicitly unsupported flags assert not use_separate_proj_weight diff --git a/nestedtensor/version.py b/nestedtensor/version.py index 46b83c65..73166668 100644 --- a/nestedtensor/version.py +++ b/nestedtensor/version.py @@ -1,5 +1,5 @@ -__version__ = '0.0.1.dev20208254+c4d8dc2' -git_version = 'c4d8dc2f5786e5ddb96a7458a319c253d0261e73' +__version__ = '0.0.1.dev20208255+faee8a1' +git_version = 'faee8a1a2578f7ecb80098d2cb792ea7c22e61ab' from nestedtensor import _C if hasattr(_C, 'CUDA_VERSION'): cuda = _C.CUDA_VERSION
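Reference note: the single-tensor math that nestedtensor::min_mha implements
can be sketched in plain PyTorch. This is an illustrative sketch only, not
part of the series: the name ref_min_mha is made up, shapes are dense
(batch, seq, embed_dim), in_proj_bias is assumed present, and masking /
need_weights are omitted, mirroring the C++ path above.

    import torch
    import torch.nn.functional as F

    def ref_min_mha(num_heads, head_dim, dropout_p, training,
                    query, key, value, in_proj_weight, in_proj_bias,
                    scaling, out_proj_weight, out_proj_bias):
        # query/key/value: (batch, seq, E) with E == num_heads * head_dim
        bsz, seq, edim = query.shape
        # rows [0, E), [E, 2E), [2E, 3E) of the packed projection weight
        w_q, w_k, w_v = in_proj_weight.chunk(3, dim=0)
        b_q, b_k, b_v = in_proj_bias.chunk(3, dim=0)
        # addmm(b, x, W.t(), beta=scaling, alpha=scaling) == scaling * (x @ W.t() + b)
        q = scaling * (query @ w_q.t() + b_q)
        k = key @ w_k.t() + b_k
        v = value @ w_v.t() + b_v
        # split heads: (batch, heads, seq, head_dim)
        q = q.reshape(bsz, seq, num_heads, head_dim).transpose(1, 2)
        k = k.reshape(bsz, seq, num_heads, head_dim).transpose(1, 2)
        v = v.reshape(bsz, seq, num_heads, head_dim).transpose(1, 2)
        attn = F.softmax(q @ k.transpose(2, 3), dim=-1)
        attn = F.dropout(attn, p=dropout_p, training=training)
        out = (attn @ v).transpose(1, 2).reshape(bsz, seq, edim)
        return out @ out_proj_weight.t() + out_proj_bias

    # The packed fast paths in matmul.cpp follow the same one-big-op idea:
    # view the flat NT buffer as a 2-D matrix, run one large GEMM, reshape
    # back over nested_size. (buffer, other, input as in NestedTensor_addmm;
    # sketch only.)
    #   flat = buffer.reshape(-1, other.size(0))   # constituents stacked row-wise
    #   out = torch.addmm(input, flat, other)      # one GEMM over packed rows
    #   packed = out.reshape(-1)                   # back to the flat NT buffer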