From 672134a32a2e7b69698e61101961a5fc87629644 Mon Sep 17 00:00:00 2001 From: Jiayu Liu Date: Tue, 6 Feb 2024 16:01:42 +0800 Subject: [PATCH] implement groups for conv1d and conv2d --- CONTRIBUTING.md | 24 ++-- benchmarks/python/comparative/compare.py | 6 +- mlx/ops.cpp | 136 +++++++++++++++++------ python/mlx/nn/layers/convolution.py | 33 +++++- python/tests/test_load.py | 16 ++- python/tests/test_nn.py | 5 + 6 files changed, 160 insertions(+), 60 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f1531bb88..12cd354a0 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -5,26 +5,26 @@ possible. ## Pull Requests -1. Fork and submit pull requests to the repo. +1. Fork and submit pull requests to the repo. 2. If you've added code that should be tested, add tests. 3. If a change is likely to impact efficiency, run some of the benchmarks before and after the change. Examples of benchmarks can be found in `benchmarks/python/`. 4. If you've changed APIs, update the documentation. -5. Every PR should have passing tests and at least one review. +5. Every PR should have passing tests and at least one review. 6. For code formatting install `pre-commit` using something like `pip install pre-commit` and run `pre-commit install`. This should install hooks for running `black` and `clang-format` to ensure consistent style for C++ and python code. - + You can also run the formatters manually as follows: - - ``` - clang-format -i file.cpp - ``` - - ``` - black file.py - ``` - + + ```bash + clang-format -i file.cpp + ``` + + ```bash + black file.py + ``` + or run `pre-commit run --all-files` to check all files in the repo. ## Issues diff --git a/benchmarks/python/comparative/compare.py b/benchmarks/python/comparative/compare.py index a9d3df22d..5b71cf583 100644 --- a/benchmarks/python/comparative/compare.py +++ b/benchmarks/python/comparative/compare.py @@ -80,10 +80,8 @@ def predicate(x): _filter = make_predicate(args.filter, args.negative_filter) if args.mlx_dtypes: - compare_filtered = ( - lambda x: compare_mlx_dtypes( - x.split() + rest, args.mlx_dtypes[0], args.mlx_dtypes[1] - ) + compare_filtered = lambda x: ( + compare_mlx_dtypes(x.split() + rest, args.mlx_dtypes[0], args.mlx_dtypes[1]) if _filter(x) else None ) diff --git a/mlx/ops.cpp b/mlx/ops.cpp index 01ee6d388..87a95986a 100644 --- a/mlx/ops.cpp +++ b/mlx/ops.cpp @@ -2734,7 +2734,8 @@ inline std::vector conv_out_shape( return out_shape; } -inline void run_conv_checks(const array& in, const array& wt, int n_dim) { +inline void +run_conv_checks(const array& in, const array& wt, int n_dim, int n_groups) { if (!is_floating_point(in.dtype()) && kindof(in.dtype()) != Dtype::Kind::c) { std::ostringstream msg; msg << "[conv] Invalid input array with type " << in.dtype() << "." @@ -2767,6 +2768,22 @@ inline void run_conv_checks(const array& in, const array& wt, int n_dim) { << " input: " << in.shape() << " and weight: " << wt.shape(); throw std::invalid_argument(msg.str()); } + + if (in.shape(n_dim + 1) % n_groups != 0) { + std::ostringstream msg; + msg << "[conv] The number of input channels must be divisible by the number" + << " of groups. Got input with shape " << in.shape() << " and groups " + << n_groups << "."; + throw std::invalid_argument(msg.str()); + } + + if (wt.shape(n_dim + 1) % n_groups != 0) { + std::ostringstream msg; + msg << "[conv] The number of output channels must be divisible by the number" + << " of groups. 
Got weight with shape " << wt.shape() << " and groups " + << n_groups << "."; + throw std::invalid_argument(msg.str()); + } } } // namespace @@ -2781,15 +2798,15 @@ array conv1d( int groups /* = 1 */, StreamOrDevice s /* = {} */) { // Run checks - if (groups != 1) { - throw std::invalid_argument("[conv1d] Cannot handle groups != 1 yet"); + if (groups < 1) { + throw std::invalid_argument("[conv1d] Invalid groups < 1"); } if (dilation != 1) { throw std::invalid_argument("[conv1d] Cannot handle dilation != 1 yet"); } // Run checks - run_conv_checks(in_, wt_, 1); + run_conv_checks(in_, wt_, 1, groups); auto in = in_; auto wt = wt_; @@ -2802,21 +2819,48 @@ array conv1d( std::vector strides_vec = {stride}; std::vector padding_vec = {padding}; std::vector dilation_vec = {dilation}; + std::vector input_dilation_vec = {1, 1}; - // Get output shapes - std::vector out_shape = conv_out_shape( - in.shape(), wt.shape(), strides_vec, padding_vec, dilation_vec); - - return array( - out_shape, - in.dtype(), - std::make_unique( - to_stream(s), - padding_vec, + if (groups == 1) { + // Get output shapes + std::vector out_shape = conv_out_shape( + in.shape(), wt.shape(), strides_vec, padding_vec, dilation_vec); + return array( + out_shape, + in.dtype(), + std::make_unique( + to_stream(s), + padding_vec, + strides_vec, + dilation_vec, + input_dilation_vec), + {in, wt}); + } else { + // Grouped convolution + auto in_slices = split(in, groups, -1, s); + auto wt_slices = split(wt, groups, 0, s); + std::vector out_slices; + for (auto i = 0; i < groups; i++) { + auto out_shape = conv_out_shape( + in_slices[i].shape(), + wt_slices[i].shape(), strides_vec, - dilation_vec, - std::vector(1, 1)), - {in, wt}); + padding_vec, + dilation_vec); + auto out_slice = array( + out_shape, + in.dtype(), + std::make_unique( + to_stream(s), + padding_vec, + strides_vec, + dilation_vec, + input_dilation_vec), + {in_slices[i], wt_slices[i]}); + out_slices.push_back(out_slice); + } + return concatenate(out_slices, -1, s); + } } /** 2D convolution with a filter */ @@ -2829,15 +2873,15 @@ array conv2d( int groups /* = 1 */, StreamOrDevice s /* = {} */) { // Run checks - if (groups != 1) { - throw std::invalid_argument("[conv2d] Cannot handle groups != 1 yet"); + if (groups < 1) { + throw std::invalid_argument("[conv2d] Invalid groups < 1"); } if (dilation.first != 1 || dilation.second != 1) { throw std::invalid_argument("[conv2d] Cannot handle dilation != 1 yet"); } // Run checks - run_conv_checks(in_, wt_, 2); + run_conv_checks(in_, wt_, 2, groups); auto in = in_; auto wt = wt_; @@ -2850,21 +2894,49 @@ array conv2d( std::vector strides_vec = {stride.first, stride.second}; std::vector padding_vec = {padding.first, padding.second}; std::vector dilation_vec = {dilation.first, dilation.second}; + std::vector input_dilation_vec = {2, 1}; - // Get output shapes - std::vector out_shape = conv_out_shape( - in.shape(), wt.shape(), strides_vec, padding_vec, dilation_vec); + if (groups == 1) { + // Get output shapes + std::vector out_shape = conv_out_shape( + in.shape(), wt.shape(), strides_vec, padding_vec, dilation_vec); - return array( - out_shape, - in.dtype(), - std::make_unique( - to_stream(s), - padding_vec, + return array( + out_shape, + in.dtype(), + std::make_unique( + to_stream(s), + padding_vec, + strides_vec, + dilation_vec, + input_dilation_vec), + {in, wt}); + } else { + // Grouped convolution + auto in_slices = split(in, groups, -1, s); + auto wt_slices = split(wt, groups, 0, s); + std::vector out_slices; + for (auto i = 0; i < 
groups; i++) { + auto out_shape = conv_out_shape( + in_slices[i].shape(), + wt_slices[i].shape(), strides_vec, - dilation_vec, - std::vector(2, 1)), - {in, wt}); + padding_vec, + dilation_vec); + auto out_slice = array( + out_shape, + in.dtype(), + std::make_unique( + to_stream(s), + padding_vec, + strides_vec, + dilation_vec, + input_dilation_vec), + {in_slices[i], wt_slices[i]}); + out_slices.push_back(out_slice); + } + return concatenate(out_slices, -1, s); + } } array quantized_matmul( diff --git a/python/mlx/nn/layers/convolution.py b/python/mlx/nn/layers/convolution.py index c6928e188..3b3679825 100644 --- a/python/mlx/nn/layers/convolution.py +++ b/python/mlx/nn/layers/convolution.py @@ -23,6 +23,9 @@ class Conv1d(Module): Default: 1. padding (int, optional): How many positions to 0-pad the input with. Default: 0. + dilation (int, optional): The size of the dilation. Default: 1. + groups (int, optional): The number of groups to split the input. + Default: 1. bias (bool, optional): If ``True`` add a learnable bias to the output. Default: ``True`` """ @@ -34,6 +37,8 @@ def __init__( kernel_size: int, stride: int = 1, padding: int = 0, + dilation: int = 1, + groups: int = 1, bias: bool = True, ): super().__init__() @@ -49,16 +54,21 @@ def __init__( self.padding = padding self.stride = stride + self.dilation = dilation + self.groups = groups def _extra_repr(self): return ( f"{self.weight.shape[-1]}, {self.weight.shape[0]}, " f"kernel_size={self.weight.shape[1]}, stride={self.stride}, " - f"padding={self.padding}, bias={'bias' in self}" + f"padding={self.padding}, dilation={self.dilation}, " + f"groups={self.groups} bias={'bias' in self}" ) def __call__(self, x): - y = mx.conv1d(x, self.weight, self.stride, self.padding) + y = mx.conv1d( + x, self.weight, self.stride, self.padding, self.dilation, self.groups + ) if "bias" in self: y = y + self.bias return y @@ -81,6 +91,10 @@ class Conv2d(Module): applying the filter. Default: 1. padding (int or tuple, optional): How many positions to 0-pad the input with. Default: 0. + dilation (int or tuple, optional): The size of the dilation. + Default: 1. + groups (int, optional): The number of groups to split the input. + Default: 1. bias (bool, optional): If ``True`` add a learnable bias to the output. 
Default: ``True`` """ @@ -92,13 +106,15 @@ def __init__( kernel_size: Union[int, tuple], stride: Union[int, tuple] = 1, padding: Union[int, tuple] = 0, + dilation: Union[int, tuple] = 1, + groups: int = 1, bias: bool = True, ): super().__init__() - kernel_size, stride, padding = map( + kernel_size, stride, padding, dilation = map( lambda x: (x, x) if isinstance(x, int) else x, - (kernel_size, stride, padding), + (kernel_size, stride, padding, dilation), ) scale = math.sqrt(1 / (in_channels * kernel_size[0] * kernel_size[1])) self.weight = mx.random.uniform( @@ -111,16 +127,21 @@ def __init__( self.padding = padding self.stride = stride + self.dilation = dilation + self.groups = groups def _extra_repr(self): return ( f"{self.weight.shape[-1]}, {self.weight.shape[0]}, " f"kernel_size={self.weight.shape[1:2]}, stride={self.stride}, " - f"padding={self.padding}, bias={'bias' in self}" + f"padding={self.padding}, dilation={self.dilation}, " + f"groups={self.groups}, bias={'bias' in self}" ) def __call__(self, x): - y = mx.conv2d(x, self.weight, self.stride, self.padding) + y = mx.conv2d( + x, self.weight, self.stride, self.padding, self.dilation, self.groups + ) if "bias" in self: y = y + self.bias return y diff --git a/python/tests/test_load.py b/python/tests/test_load.py index a37ba83a9..fdd42351d 100644 --- a/python/tests/test_load.py +++ b/python/tests/test_load.py @@ -75,9 +75,11 @@ def test_save_and_load_safetensors(self): self.test_dir, f"mlx_{dt}_{i}_fs.safetensors" ) save_dict = { - "test": mx.random.normal(shape=shape, dtype=getattr(mx, dt)) - if dt in ["float32", "float16", "bfloat16"] - else mx.ones(shape, dtype=getattr(mx, dt)) + "test": ( + mx.random.normal(shape=shape, dtype=getattr(mx, dt)) + if dt in ["float32", "float16", "bfloat16"] + else mx.ones(shape, dtype=getattr(mx, dt)) + ) } with open(save_file_mlx, "wb") as f: @@ -104,9 +106,11 @@ def test_save_and_load_gguf(self): self.test_dir, f"mlx_{dt}_{i}_fs.gguf" ) save_dict = { - "test": mx.random.normal(shape=shape, dtype=getattr(mx, dt)) - if dt in ["float32", "float16", "bfloat16"] - else mx.ones(shape, dtype=getattr(mx, dt)) + "test": ( + mx.random.normal(shape=shape, dtype=getattr(mx, dt)) + if dt in ["float32", "float16", "bfloat16"] + else mx.ones(shape, dtype=getattr(mx, dt)) + ) } mx.save_gguf(save_file_mlx, save_dict) diff --git a/python/tests/test_nn.py b/python/tests/test_nn.py index 7749e159a..af492bbc8 100644 --- a/python/tests/test_nn.py +++ b/python/tests/test_nn.py @@ -627,6 +627,11 @@ def test_conv2d(self): self.assertEqual(y.shape, (4, 3, 3, 8)) self.assertLess(mx.abs(y - c.weight.sum((1, 2, 3))).max(), 1e-4) + # 3x3 conv with groups = 3 + c = nn.Conv2d(3, 6, 3, groups=3) + y = c(x) + self.assertEqual(y.shape, (4, 6, 6, 6)) + def test_sequential(self): x = mx.ones((10, 2)) m = nn.Sequential(nn.Linear(2, 10), nn.ReLU(), nn.Linear(10, 1))
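
---

For reference, a minimal usage sketch of the `groups` argument added by this patch. This is not part of the diff itself; it assumes the patch is applied and `mlx` is installed, uses MLX's channels-last layout, and the expected output shapes follow the shape logic in the patched `ops.cpp` and the new test case in `test_nn.py`:

```python
import mlx.core as mx
import mlx.nn as nn

# Grouped 2D convolution, mirroring the new test in test_nn.py:
# 3 input channels, 6 output channels, 3x3 kernel, groups=3.
x = mx.ones((4, 8, 8, 3))          # (N, H, W, C_in), channels-last
c = nn.Conv2d(3, 6, 3, groups=3)
y = c(x)
print(y.shape)                      # (4, 6, 6, 6)

# Grouped 1D convolution via the new `groups` argument on Conv1d:
# 4 input channels split into 2 groups, 8 output channels.
x1 = mx.ones((4, 16, 4))            # (N, L, C_in)
c1 = nn.Conv1d(4, 8, kernel_size=3, groups=2)
print(c1(x1).shape)                 # (4, 14, 8)
```

Under the hood, the grouped path in `conv1d`/`conv2d` splits the input along its channel axis and the weight along its output-channel axis into `groups` slices, runs one convolution per slice, and concatenates the per-group outputs along the channel axis.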