
Commit a1d2d24

jvdp1 and milancurcic authored
Support of strides in the convolutional layers (#239)
* Addition of stride in API of conv
* Implementation of stride in conv1d
* Start implementation of stride in conv2d
* Fix conv1d with stride
* Implementation of stride
* Apply suggestions from code review
* Fix prior bug in the accumulation of gradients in the conv2d backward pass
* Tidy up

Co-authored-by: milancurcic <caomaco@gmail.com>
1 parent bed0f51 commit a1d2d24

9 files changed: 139 additions & 42 deletions
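Before the per-file diffs, a minimal usage sketch of the new optional stride argument. This is not code from the commit: it assumes the public nf module exports the conv constructor and the layer type (the updated test below calls conv this way), and that the generic resolves to conv1d or conv2d depending on whether kernel_height is passed.

program conv_stride_sketch
  ! Minimal sketch, not part of this commit; assumes nf exports conv and layer.
  use nf, only: conv, layer
  implicit none
  type(layer) :: l1, l2
  ! 1-d convolution: scalar stride (defaults to 1 when omitted).
  l1 = conv(filters=16, kernel_width=3, stride=2)
  ! 2-d convolution: one stride per spatial dimension.
  l2 = conv(filters=16, kernel_width=3, kernel_height=3, stride=[2, 2])
end program conv_stride_sketch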

example/cnn_mnist.f90

Lines changed: 1 addition & 1 deletion

@@ -12,7 +12,7 @@ program cnn_mnist
   real, allocatable :: validation_images(:,:), validation_labels(:)
   real, allocatable :: testing_images(:,:), testing_labels(:)
   integer :: n
-  integer, parameter :: num_epochs = 250
+  integer, parameter :: num_epochs = 20
 
   call load_mnist(training_images, training_labels, &
     validation_images, validation_labels, &

src/nf/nf_conv1d_layer.f90

Lines changed: 3 additions & 1 deletion

@@ -15,6 +15,7 @@ module nf_conv1d_layer
     integer :: channels
     integer :: kernel_size
     integer :: filters
+    integer :: stride
 
     real, allocatable :: biases(:) ! size(filters)
     real, allocatable :: kernel(:,:,:) ! filters x channels x window
@@ -39,12 +40,13 @@ module nf_conv1d_layer
   end type conv1d_layer
 
   interface conv1d_layer
-    module function conv1d_layer_cons(filters, kernel_size, activation) &
+    module function conv1d_layer_cons(filters, kernel_size, activation, stride) &
         result(res)
       !! `conv1d_layer` constructor function
       integer, intent(in) :: filters
       integer, intent(in) :: kernel_size
       class(activation_function), intent(in) :: activation
+      integer, intent(in) :: stride
       type(conv1d_layer) :: res
     end function conv1d_layer_cons
   end interface conv1d_layer

src/nf/nf_conv1d_layer_submodule.f90

Lines changed: 19 additions & 9 deletions

@@ -7,15 +7,17 @@
 
 contains
 
-  module function conv1d_layer_cons(filters, kernel_size, activation) result(res)
+  module function conv1d_layer_cons(filters, kernel_size, activation, stride) result(res)
     integer, intent(in) :: filters
     integer, intent(in) :: kernel_size
     class(activation_function), intent(in) :: activation
+    integer, intent(in) :: stride
     type(conv1d_layer) :: res
 
     res % kernel_size = kernel_size
     res % filters = filters
     res % activation_name = activation % get_name()
+    res % stride = stride
     allocate( res % activation, source = activation )
   end function conv1d_layer_cons
@@ -24,7 +26,9 @@ module subroutine init(self, input_shape)
     integer, intent(in) :: input_shape(:)
 
     self % channels = input_shape(1)
-    self % width = input_shape(2) - self % kernel_size + 1
+    self % width = (input_shape(2) - self % kernel_size) / self % stride + 1
+
+    if (mod(input_shape(2) - self % kernel_size, self % stride) /= 0) self % width = self % width + 1
 
     ! Output of shape: filters x width
     allocate(self % output(self % filters, self % width))
@@ -55,19 +59,22 @@ end subroutine init
   pure module subroutine forward(self, input)
     class(conv1d_layer), intent(in out) :: self
     real, intent(in) :: input(:,:)
+    integer :: input_width
     integer :: j, n
     integer :: iws, iwe
 
+    input_width = size(input, dim=2)
+
     ! Loop over output positions.
     do j = 1, self % width
       ! Compute the input window corresponding to output index j.
       ! In forward: center index = j + half_window, so window = indices j to j+kernel_size-1.
-      iws = j
-      iwe = j + self % kernel_size - 1
+      iws = self % stride * (j-1) + 1
+      iwe = min(iws + self % kernel_size - 1, input_width)
 
       ! For each filter, compute the convolution (inner product over channels and kernel width).
       do concurrent (n = 1:self % filters)
-        self % z(n, j) = sum(self % kernel(n,:,:) * input(:,iws:iwe))
+        self % z(n, j) = sum(self % kernel(n,:,1:iwe-iws+1) * input(:,iws:iwe))
       end do
 
       ! Add the bias for each filter.
@@ -85,6 +92,7 @@ pure module subroutine backward(self, input, gradient)
     real, intent(in) :: input(:,:)
     real, intent(in) :: gradient(:,:)
 
+    integer :: input_channels, input_width
     integer :: j, n, k
     integer :: iws, iwe
@@ -93,6 +101,8 @@ pure module subroutine backward(self, input, gradient)
     real :: db_local(self % filters)
     real :: dw_local(self % filters, self % channels, self % kernel_size)
 
+    input_width = size(input, dim=2)
+
     !--- Compute the local gradient gdz = (dL/dy) * sigma'(z) for each output.
     gdz = gradient * self % activation % eval_prime(self % z)
@@ -108,13 +118,13 @@
     ! iws = j, iwe = j + kernel_size - 1.
     do n = 1, self % filters
       do j = 1, self % width
-        iws = j
-        iwe = j + self % kernel_size - 1
+        iws = self % stride * (j-1) + 1
+        iwe = min(iws + self % kernel_size - 1, input_width)
         do k = 1, self % channels
           ! Weight gradient: accumulate contribution from the input window.
-          dw_local(n,k,:) = dw_local(n,k,:) + input(k,iws:iwe) * gdz(n,j)
+          dw_local(n,k,1:iwe-iws+1) = dw_local(n,k,1:iwe-iws+1) + input(k,iws:iwe) * gdz(n,j)
           ! Input gradient: propagate gradient back to the input window.
-          self % gradient(k,iws:iwe) = self % gradient(k,iws:iwe) + self % kernel(n,k,:) * gdz(n,j)
+          self % gradient(k,iws:iwe) = self % gradient(k,iws:iwe) + self % kernel(n,k,1:iwe-iws+1) * gdz(n,j)
         end do
       end do
     end do
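To make the new output-width rule concrete, here is a standalone sketch (not part of the commit) that reproduces the init and forward index arithmetic for the configuration the updated test below exercises: input width 17, kernel size 3, stride 3.

program conv1d_width_demo
  implicit none
  integer, parameter :: input_width = 17, kernel_size = 3, stride = 3
  integer :: width, j, iws, iwe
  ! Integer division counts the full windows; the mod test adds one more,
  ! partial, window when the last stride step does not land exactly.
  width = (input_width - kernel_size) / stride + 1
  if (mod(input_width - kernel_size, stride) /= 0) width = width + 1
  print '(a,i0)', 'output width: ', width  ! prints 6
  do j = 1, width
    iws = stride * (j - 1) + 1                     ! same formulas as forward()
    iwe = min(iws + kernel_size - 1, input_width)
    print '(3(a,i0))', 'j=', j, '  window ', iws, ':', iwe
  end do
  ! The last window is 16:17, i.e. only 2 elements wide; forward() handles
  ! it by slicing the kernel to kernel(n,:,1:iwe-iws+1).
end program conv1d_width_demo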

src/nf/nf_conv2d_layer.f90

Lines changed: 3 additions & 1 deletion

@@ -16,6 +16,7 @@ module nf_conv2d_layer
     integer :: channels
     integer :: kernel_size
     integer :: filters
+    integer :: stride(2)
 
     real, allocatable :: biases(:) ! size(filters)
     real, allocatable :: kernel(:,:,:,:) ! filters x channels x window x window
@@ -40,12 +41,13 @@ module nf_conv2d_layer
   end type conv2d_layer
 
   interface conv2d_layer
-    module function conv2d_layer_cons(filters, kernel_size, activation) &
+    module function conv2d_layer_cons(filters, kernel_size, activation, stride) &
        result(res)
      !! `conv2d_layer` constructor function
      integer, intent(in) :: filters
      integer, intent(in) :: kernel_size
      class(activation_function), intent(in) :: activation
+     integer, intent(in) :: stride(:)
      type(conv2d_layer) :: res
    end function conv2d_layer_cons
  end interface conv2d_layer

src/nf/nf_conv2d_layer_submodule.f90

Lines changed: 29 additions & 23 deletions

@@ -7,16 +7,18 @@
 
 contains
 
-  module function conv2d_layer_cons(filters, kernel_size, activation) result(res)
+  module function conv2d_layer_cons(filters, kernel_size, activation, stride) result(res)
     implicit none
     integer, intent(in) :: filters
     integer, intent(in) :: kernel_size
     class(activation_function), intent(in) :: activation
+    integer, intent(in) :: stride(:)
     type(conv2d_layer) :: res
 
     res % kernel_size = kernel_size
     res % filters = filters
     res % activation_name = activation % get_name()
+    res % stride = stride
     allocate( res % activation, source = activation )
 
   end function conv2d_layer_cons
@@ -28,8 +30,12 @@ module subroutine init(self, input_shape)
     integer, intent(in) :: input_shape(:)
 
     self % channels = input_shape(1)
-    self % width = input_shape(2) - self % kernel_size + 1
-    self % height = input_shape(3) - self % kernel_size + 1
+
+    self % width = (input_shape(2) - self % kernel_size) / self % stride(1) + 1
+    if (mod(input_shape(2) - self % kernel_size, self % stride(1)) /= 0) self % width = self % width + 1
+
+    self % height = (input_shape(3) - self % kernel_size) / self % stride(2) + 1
+    if (mod(input_shape(3) - self % kernel_size, self % stride(2)) /= 0) self % height = self % height + 1
 
     ! Output of shape filters x width x height
     allocate(self % output(self % filters, self % width, self % height))
@@ -83,25 +89,24 @@ pure module subroutine forward(self, input)
     ! of the input that correspond to the center of each window.
     istart = half_window + 1 ! TODO kernel_width
     jstart = half_window + 1 ! TODO kernel_height
-    iend = input_width - istart + 1
-    jend = input_height - jstart + 1
 
-    convolution: do concurrent(i = istart:iend, j = jstart:jend)
+    convolution: do concurrent(i = 1:self % width, j = 1:self % height)
 
       ! Start and end indices of the input data on the filter window
       ! iws and jws are also coincidentally the indices of the output matrix
-      iws = i - half_window ! TODO kernel_width
-      iwe = i + half_window ! TODO kernel_width
-      jws = j - half_window ! TODO kernel_height
-      jwe = j + half_window ! TODO kernel_height
+      iws = istart + self % stride(1) * (i-1) - half_window ! TODO kernel_width
+      iwe = min(iws + 2*half_window, input_width) ! TODO kernel_width
+
+      jws = jstart + self % stride(2) * (j-1) - half_window ! TODO kernel_height
+      jwe = min(jws + 2*half_window, input_height) ! TODO kernel_height
 
       ! Compute the inner tensor product, sum(w_ij * x_ij), for each filter.
       do concurrent(n = 1:self % filters)
-        self % z(n,iws,jws) = sum(self % kernel(n,:,:,:) * input(:,iws:iwe,jws:jwe))
+        self % z(n,i,j) = sum(self % kernel(n,:,1:iwe-iws+1,1:jwe-jws+1) * input(:,iws:iwe,jws:jwe))
       end do
 
       ! Add bias to the inner product.
-      self % z(:,iws,jws) = self % z(:,iws,jws) + self % biases
+      self % z(:,i,j) = self % z(:,i,j) + self % biases
 
     end do convolution
@@ -156,21 +161,22 @@ pure module subroutine backward(self, input, gradient)
     do concurrent( &
       n = 1:self % filters, &
       k = 1:self % channels, &
-      i = istart:iend, &
-      j = jstart:jend &
+      i = 1:self % width, &
+      j = 1:self % height &
     )
       ! Start and end indices of the input data on the filter window
-      iws = i - half_window ! TODO kernel_width
-      iwe = i + half_window ! TODO kernel_width
-      jws = j - half_window ! TODO kernel_height
-      jwe = j + half_window ! TODO kernel_height
+      iws = istart + self % stride(1) * (i-1) - half_window ! TODO kernel_width
+      iwe = min(iws + 2*half_window, input_width) ! TODO kernel_width
+
+      jws = jstart + self % stride(2) * (j-1) - half_window ! TODO kernel_height
+      jwe = min(jws + 2*half_window, input_height) ! TODO kernel_height
 
-      ! dL/dw = sum(dL/dy * sigma'(z) * x)
-      dw(n,k,:,:) = dw(n,k,:,:) + input(k,iws:iwe,jws:jwe) * gdz(n,iws:iwe,jws:jwe)
+      ! dL/dw = sum(gdz * x)
+      dw(n,k,:,:) = dw(n,k,:,:) + input(k,iws:iwe,jws:jwe) * gdz(n,i,j)
 
-      ! dL/dx = dL/dy * sigma'(z) .inner. w
-      self % gradient(k,i,j) = self % gradient(k,i,j) &
-        + sum(gdz(n,iws:iwe,jws:jwe) * self % kernel(n,k,:,:))
+      ! dL/dx = sum(gdz * w)
+      self % gradient(k,iws:iwe,jws:jwe) = self % gradient(k,iws:iwe,jws:jwe) &
+        + gdz(n,i,j) * self % kernel(n,k,:,:)
 
     end do
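Because istart = half_window + 1, the new window start simplifies to iws = self % stride(1) * (i-1) + 1, the same rule as the 1-d layer. A standalone sketch of the resulting shape rule, using assumed example values (28x28 input, 3x3 kernel, stride [2, 2]) rather than anything from the commit:

program conv2d_shape_demo
  implicit none
  integer, parameter :: kernel_size = 3
  integer, parameter :: input_shape(3) = [1, 28, 28]  ! channels, width, height
  integer, parameter :: stride(2) = [2, 2]
  integer :: width, height
  ! Same computation as conv2d init, applied once per spatial dimension.
  width = (input_shape(2) - kernel_size) / stride(1) + 1
  if (mod(input_shape(2) - kernel_size, stride(1)) /= 0) width = width + 1
  height = (input_shape(3) - kernel_size) / stride(2) + 1
  if (mod(input_shape(3) - kernel_size, stride(2)) /= 0) height = height + 1
  print '(2(a,i0))', 'output: ', width, ' x ', height  ! prints 14 x 14
end program conv2d_shape_demo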

src/nf/nf_layer_constructors.f90

Lines changed: 6 additions & 2 deletions

@@ -94,7 +94,7 @@ end function input3d
 
   interface conv
 
-    module function conv1d(filters, kernel_width, activation) result(res)
+    module function conv1d(filters, kernel_width, activation, stride) result(res)
       !! 1-d convolutional layer constructor.
      !!
      !! This layer is for building 1-d convolutional network.
@@ -117,11 +117,13 @@ module function conv1d(filters, kernel_width, activation) result(res)
        !! Width of the convolution window, commonly 3 or 5
      class(activation_function), intent(in), optional :: activation
        !! Activation function (default sigmoid)
+     integer, intent(in), optional :: stride
+       !! Stride length of the convolution
      type(layer) :: res
        !! Resulting layer instance
    end function conv1d
 
-   module function conv2d(filters, kernel_width, kernel_height, activation) result(res)
+   module function conv2d(filters, kernel_width, kernel_height, activation, stride) result(res)
      !! 2-d convolutional layer constructor.
      !!
      !! This layer is for building 2-d convolutional network.
@@ -147,6 +149,8 @@ module function conv2d(filters, kernel_width, kernel_height, activation) result(res)
        !! Height of the convolution window, commonly 3 or 5
      class(activation_function), intent(in), optional :: activation
        !! Activation function (default sigmoid)
+     integer, intent(in), optional :: stride(:)
+       !! Stride length of the convolution
      type(layer) :: res
        !! Resulting layer instance
    end function conv2d

src/nf/nf_layer_constructors_submodule.f90

Lines changed: 29 additions & 4 deletions

@@ -23,12 +23,14 @@
 
 contains
 
-  module function conv1d(filters, kernel_width, activation) result(res)
+  module function conv1d(filters, kernel_width, activation, stride) result(res)
    integer, intent(in) :: filters
    integer, intent(in) :: kernel_width
    class(activation_function), intent(in), optional :: activation
+   integer, intent(in), optional :: stride
    type(layer) :: res
 
+   integer :: stride_tmp
    class(activation_function), allocatable :: activation_tmp
 
    res % name = 'conv1d'
@@ -41,20 +43,31 @@ module function conv1d(filters, kernel_width, activation) result(res)
 
    res % activation = activation_tmp % get_name()
 
+   if (present(stride)) then
+     stride_tmp = stride
+   else
+     stride_tmp = 1
+   endif
+
+   if (stride_tmp < 1) &
+     error stop 'stride must be >= 1 in a conv1d layer'
+
    allocate( &
      res % p, &
-     source=conv1d_layer(filters, kernel_width, activation_tmp) &
+     source=conv1d_layer(filters, kernel_width, activation_tmp, stride_tmp) &
    )
 
  end function conv1d
 
- module function conv2d(filters, kernel_width, kernel_height, activation) result(res)
+ module function conv2d(filters, kernel_width, kernel_height, activation, stride) result(res)
    integer, intent(in) :: filters
    integer, intent(in) :: kernel_width
    integer, intent(in) :: kernel_height
    class(activation_function), intent(in), optional :: activation
+   integer, intent(in), optional :: stride(:)
    type(layer) :: res
 
+   integer, allocatable :: stride_tmp(:)
    class(activation_function), allocatable :: activation_tmp
 
    ! Enforce kernel_width == kernel_height for now;
@@ -73,9 +86,21 @@ module function conv2d(filters, kernel_width, kernel_height, activation) result(res)
 
    res % activation = activation_tmp % get_name()
 
+   if (present(stride)) then
+     stride_tmp = stride
+   else
+     stride_tmp = [1, 1]
+   endif
+
+   if (size(stride_tmp) /= 2) &
+     error stop 'size of stride must be equal to 2 in a conv2d layer'
+
+   if (stride_tmp(1) < 1 .or. stride_tmp(2) < 1) &
+     error stop 'stride must be >= 1 in a conv2d layer'
+
    allocate( &
      res % p, &
-     source=conv2d_layer(filters, kernel_width, activation_tmp) &
+     source=conv2d_layer(filters, kernel_width, activation_tmp, stride_tmp) &
    )
 
  end function conv2d
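At the call site, the defaults and checks above behave as in this sketch. It is not from the commit and assumes nf exports conv and layer; the commented-out calls would abort via the error stop messages introduced here.

program conv_defaults_sketch
  use nf, only: conv, layer
  implicit none
  type(layer) :: l
  l = conv(16, 3)                    ! conv1d; stride omitted, defaults to 1
  l = conv(16, 3, stride=2)          ! conv1d with stride 2
  l = conv(16, 3, 3)                 ! conv2d; stride defaults to [1, 1]
  l = conv(16, 3, 3, stride=[2, 2])  ! conv2d with per-dimension stride
  ! l = conv(16, 3, stride=0)        ! stops: 'stride must be >= 1 in a conv1d layer'
  ! l = conv(16, 3, 3, stride=[2])   ! stops: 'size of stride must be equal to 2 in a conv2d layer'
end program conv_defaults_sketch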

test/test_conv1d_layer.f90

Lines changed: 24 additions & 1 deletion

@@ -58,6 +58,7 @@ program test_conv1d_layer
   select type(this_layer => input_layer % p); type is(input2d_layer)
     call this_layer % set(sample_input)
   end select
+  deallocate(sample_input)
 
   call conv1d_layer % forward(input_layer)
   call conv1d_layer % get_output(output)
@@ -67,11 +68,33 @@ program test_conv1d_layer
     write(stderr, '(a)') 'conv1d layer with zero input and sigmoid function must forward to all 0.5.. failed'
   end if
 
+  ! Minimal conv1d layer: 1 channel, width-17 input, kernel size 3, stride 3
+  allocate(sample_input(1, 17))
+  sample_input = 0
+
+  input_layer = input(1, 17)
+  conv1d_layer = conv(filters, kernel_size, stride = 3)
+  call conv1d_layer % init(input_layer)
+
+  select type(this_layer => input_layer % p); type is(input2d_layer)
+    call this_layer % set(sample_input)
+  end select
+  deallocate(sample_input)
+
+  call conv1d_layer % forward(input_layer)
+  call conv1d_layer % get_output(output)
+
+  if (.not. all(abs(output - 0.5) < tolerance)) then
+    ok = .false.
+    write(stderr, '(a)') 'conv1d layer with zero input and sigmoid function must forward to all 0.5.. failed'
+  end if
+
+  ! Final
   if (ok) then
     print '(a)', 'test_conv1d_layer: All tests passed.'
   else
     write(stderr, '(a)') 'test_conv1d_layer: One or more tests failed.'
    stop 1
  end if
-
+
 end program test_conv1d_layer
