From a38539c6a098d1aff1a31fca65e05a99a61b4d67 Mon Sep 17 00:00:00 2001
From: Radzivon Bartoshyk <radzivon.bartoshyk@proton.me>
Date: Tue, 3 Dec 2024 21:50:48 +0000
Subject: [PATCH 1/9] Initial implementation of 1D symmetrical filtering for
 gaussian

---
 src/dynimage.rs           |   4 +-
 src/imageops/filter_1d.rs | 967 ++++++++++++++++++++++++++++++++++++++
 src/imageops/mod.rs       |   5 +-
 src/imageops/sample.rs    | 197 +++++++-
 4 files changed, 1167 insertions(+), 6 deletions(-)
 create mode 100644 src/imageops/filter_1d.rs
diff --git a/src/dynimage.rs b/src/dynimage.rs
index e04944ad5e..ee0d15327a 100644
--- a/src/dynimage.rs
+++ b/src/dynimage.rs
@@ -15,6 +15,7 @@ use crate::error::{ImageError, ImageResult, ParameterError, ParameterErrorKind};
 use crate::flat::FlatSamples;
 use crate::image::{GenericImage, GenericImageView, ImageDecoder, ImageEncoder, ImageFormat};
 use crate::image_reader::free_functions;
+use crate::imageops::gaussian_blur_dyn_image;
 use crate::math::resize_dimensions;
 use crate::metadata::Orientation;
 use crate::traits::Pixel;
@@ -829,7 +830,8 @@ impl DynamicImage {
     /// accurate version.
     #[must_use]
     pub fn blur(&self, sigma: f32) -> DynamicImage {
-        dynamic_map!(*self, ref p => imageops::blur(p, sigma))
+        gaussian_blur_dyn_image(self, sigma)
+        // dynamic_map!(*self, ref p => imageops::blur(p, sigma))
     }
 
     /// Performs a fast blur on this image.
diff --git a/src/imageops/filter_1d.rs b/src/imageops/filter_1d.rs
new file mode 100644
index 0000000000..d70cf25221
--- /dev/null
+++ b/src/imageops/filter_1d.rs
@@ -0,0 +1,967 @@
+#![forbid(unsafe_code)]
+use crate::error::{LimitError, LimitErrorKind};
+use crate::ImageError;
+use num_traits::{AsPrimitive, MulAdd};
+use std::ops::{Add, Mul};
+
+#[cfg(any(
+    all(
+        any(target_arch = "x86", target_arch = "x86_64"),
+        target_feature = "fma"
+    ),
+    all(target_arch = "aarch64", target_feature = "neon")
+))]
+#[inline(always)]
+/// Uses fused multiply add when available
+///
+/// It is important not to call it if FMA flag is not turned on,
+/// Rust compiles extremely slow case in this case (probably does software FMA?),
+/// and we want to avoid this
+fn mla<T: Copy + Mul<T, Output = T> + Add<T, Output = T> + MulAdd<T, Output = T>>(
+    acc: T,
+    a: T,
+    b: T,
+) -> T {
+    MulAdd::mul_add(a, b, acc)
+}
+
+#[inline(always)]
+#[cfg(not(any(
+    all(
+        any(target_arch = "x86", target_arch = "x86_64"),
+        target_feature = "fma"
+    ),
+    all(target_arch = "aarch64", target_feature = "neon")
+)))]
+fn mla<T: Copy + Mul<T, Output = T> + Add<T, Output = T> + MulAdd<T, Output = T>>(
+    acc: T,
+    a: T,
+    b: T,
+) -> T {
+    acc + a * b
+}
+
+struct RowArena<T> {
+    arena: Vec<T>,
+}
+
+#[derive(Debug, Clone, Copy)]
+struct KernelShape {
+    width: usize,
+    height: usize,
+}
+
+#[derive(Debug, Clone, Copy)]
+pub(crate) struct FilterImageSize {
+    pub(crate) width: usize,
+    pub(crate) height: usize,
+}
+
+/// Pads an image row with *clamp* strategy
+///
+/// This method copies real content into center of new buffer
+/// and filling leading and trailing physical padded parts with *clamp* strategy
+fn make_arena_row<T, const COMPONENTS: usize>(
+    image: &[T],
+    source_y: usize,
+    image_size: FilterImageSize,
+    kernel_size: KernelShape,
+) -> RowArena<T>
+where
+    T: Default + Copy + Send + Sync + 'static,
+    f64: AsPrimitive<T>,
+{
+    assert_eq!(
+        image.len(),
+        COMPONENTS * image_size.width * image_size.height
+    );
+
+    let pad_w = kernel_size.width / 2;
+
+    let arena_width = image_size.width * COMPONENTS + pad_w * 2 * COMPONENTS;
+    let mut row = vec![T::default(); arena_width];
+
+    let source_offset = source_y * image_size.width * COMPONENTS;
+
+    let source_row = &image[source_offset..(source_offset + image_size.width * COMPONENTS)];
+
+    let row_dst =
+        &mut row[pad_w * COMPONENTS..(pad_w * COMPONENTS + image_size.width * COMPONENTS)];
+
+    for (dst, src) in row_dst.iter_mut().zip(source_row.iter()) {
+        *dst = *src;
+    }
+
+    for (x, dst) in (0..pad_w).zip(row.chunks_exact_mut(COMPONENTS)) {
+        let old_x = x.saturating_sub(pad_w).min(image_size.width - 1);
+        let old_px = old_x * COMPONENTS;
+        let src_iter = &source_row[old_px..(old_px + COMPONENTS)];
+        for (dst, src) in dst.iter_mut().zip(src_iter.iter()) {
+            *dst = *src;
+        }
+    }
+
+    for (x, dst) in
+        (image_size.width..(image_size.width + pad_w)).zip(row.chunks_exact_mut(COMPONENTS).rev())
+    {
+        let old_x = x.max(0).min(image_size.width - 1);
+        let old_px = old_x * COMPONENTS;
+        let src_iter = &source_row[old_px..(old_px + COMPONENTS)];
+        for (dst, src) in dst.iter_mut().zip(src_iter.iter()) {
+            *dst = *src;
+        }
+    }
+
+    RowArena { arena: row }
+}
+
+#[derive(Clone)]
+struct ArenaColumns<T>
+where
+    T: Copy,
+{
+    pub top_pad: Vec<T>,
+    pub bottom_pad: Vec<T>,
+}
+
+/// Pads a column image with *clamp* strategy
+///
+/// This method is used for *virtual* column padding.
+/// It produces two arrays that represents virtual image top part and bottom
+fn make_columns_arenas<T, const COMPONENTS: usize>(
+    image: &[T],
+    image_size: FilterImageSize,
+    kernel_size: KernelShape,
+) -> ArenaColumns<T>
+where
+    T: Default + Copy + Send + Sync + 'static,
+    f64: AsPrimitive<T>,
+{
+    assert_eq!(
+        image.len(),
+        COMPONENTS * image_size.width * image_size.height
+    );
+    let pad_h = kernel_size.height / 2;
+
+    let mut top_pad = vec![T::default(); pad_h * image_size.width * COMPONENTS];
+    let mut bottom_pad = vec![T::default(); pad_h * image_size.width * COMPONENTS];
+
+    let top_pad_stride = image_size.width * COMPONENTS;
+
+    for (ky, dst) in (0..pad_h).zip(top_pad.chunks_exact_mut(top_pad_stride)) {
+        for (kx, dst) in (0..image_size.width).zip(dst.chunks_exact_mut(COMPONENTS)) {
+            let y = ky.saturating_sub(pad_h).min(image_size.height - 1);
+            let v_src = y * top_pad_stride + kx * COMPONENTS;
+
+            let src_iter = &image[v_src..(v_src + COMPONENTS)];
+            for (dst, src) in dst.iter_mut().zip(src_iter.iter()) {
+                *dst = *src;
+            }
+        }
+    }
+
+    let bottom_iter_dst = bottom_pad.chunks_exact_mut(top_pad_stride);
+
+    for (ky, dst) in (0..pad_h).zip(bottom_iter_dst) {
+        for (kx, dst) in (0..image_size.width).zip(dst.chunks_exact_mut(COMPONENTS)) {
+            let y = (ky + image_size.height).min(image_size.height - 1);
+            let v_src = y * top_pad_stride + kx * COMPONENTS;
+            let src_iter = &image[v_src..(v_src + COMPONENTS)];
+            for (dst, src) in dst.iter_mut().zip(src_iter.iter()) {
+                *dst = *src;
+            }
+        }
+    }
+
+    ArenaColumns {
+        top_pad,
+        bottom_pad,
+    }
+}
+
+trait ToStorage<S> {
+    fn to_(self) -> S;
+}
+
+const PRECISION: i32 = 15;
+
+impl ToStorage<u8> for i32 {
+    #[inline(always)]
+    #[allow(clippy::manual_clamp)]
+    fn to_(self) -> u8 {
+        ((self + (1 << (PRECISION - 1))) >> PRECISION)
+            .min(255)
+            .max(0) as u8
+    }
+}
+
+impl ToStorage<u16> for f32 {
+    #[inline(always)]
+    fn to_(self) -> u16 {
+        self.round().min(u16::MAX as f32).max(0f32) as u16
+    }
+}
+
+impl ToStorage<f32> for f32 {
+    #[inline(always)]
+    fn to_(self) -> f32 {
+        self
+    }
+}
+
+#[derive(Copy, Clone)]
+struct Arena<'a, T> {
+    src: &'a [T],
+    components: usize,
+}
+
+#[repr(C)]
+#[derive(Debug, Clone, Ord, PartialOrd, Eq, PartialEq, Copy)]
+struct ColorGroup<const COMPS: usize, J: Copy> {
+    r: J,
+    g: J,
+    b: J,
+    a: J,
+}
+
+macro_rules! ld_group {
+    ($store: expr, $channels: expr, $offset: expr) => {{
+        if $channels == 1 {
+            ColorGroup {
+                r: $store[$offset].as_(),
+                g: 0.as_(),
+                b: 0.as_(),
+                a: 0.as_(),
+            }
+        } else if $channels == 2 {
+            ColorGroup {
+                r: $store[$offset].as_(),
+                g: $store[$offset + 1].as_(),
+                b: 0.as_(),
+                a: 0.as_(),
+            }
+        } else if $channels == 3 {
+            ColorGroup {
+                r: $store[$offset].as_(),
+                g: $store[$offset + 1].as_(),
+                b: $store[$offset + 2].as_(),
+                a: 0.as_(),
+            }
+        } else if $channels == 4 {
+            ColorGroup {
+                r: $store[$offset].as_(),
+                g: $store[$offset + 1].as_(),
+                b: $store[$offset + 2].as_(),
+                a: $store[$offset + 3].as_(),
+            }
+        } else {
+            unimplemented!()
+        }
+    }};
+}
+
+impl<const COMPS: usize, J> ColorGroup<COMPS, J>
+where
+    J: Copy + Default,
+{
+    #[inline]
+    fn from_components(r: J, g: J, b: J, a: J) -> ColorGroup<COMPS, J> {
+        ColorGroup { r, g, b, a }
+    }
+}
+
+impl<const COMPS: usize, J> Mul<J> for ColorGroup<COMPS, J>
+where
+    J: Copy + Mul<Output = J> + Default + 'static,
+{
+    type Output = Self;
+
+    #[inline]
+    fn mul(self, rhs: J) -> Self::Output {
+        if COMPS == 1 {
+            ColorGroup::from_components(self.r * rhs, self.g, self.b, self.a)
+        } else if COMPS == 2 {
+            ColorGroup::from_components(self.r * rhs, self.g * rhs, self.b, self.a)
+        } else if COMPS == 3 {
+            ColorGroup::from_components(self.r * rhs, self.g * rhs, self.b * rhs, self.a)
+        } else if COMPS == 4 {
+            ColorGroup::from_components(self.r * rhs, self.g * rhs, self.b * rhs, self.a * rhs)
+        } else {
+            unimplemented!()
+        }
+    }
+}
+
+impl<const COMPS: usize, J> MulAdd<ColorGroup<COMPS, J>, J> for ColorGroup<COMPS, J>
+where
+    J: Copy + MulAdd<J, Output = J> + Default + 'static + Mul<J, Output = J> + Add<J, Output = J>,
+{
+    type Output = Self;
+
+    #[inline]
+    fn mul_add(self, a: ColorGroup<COMPS, J>, b: J) -> Self::Output {
+        if COMPS == 1 {
+            ColorGroup::from_components(mla(a.r, self.r, b), self.g, self.b, self.a)
+        } else if COMPS == 2 {
+            ColorGroup::from_components(mla(a.r, self.r, b), mla(a.g, self.g, b), self.b, self.a)
+        } else if COMPS == 3 {
+            ColorGroup::from_components(
+                mla(a.r, self.r, b),
+                mla(a.g, self.g, b),
+                mla(a.b, self.b, b),
+                self.a,
+            )
+        } else if COMPS == 4 {
+            ColorGroup::from_components(
+                mla(a.r, self.r, b),
+                mla(a.g, self.g, b),
+                mla(a.b, self.b, b),
+                mla(a.a, self.a, b),
+            )
+        } else {
+            unimplemented!();
+        }
+    }
+}
+
+impl<const COMPS: usize, J> Add<ColorGroup<COMPS, J>> for ColorGroup<COMPS, J>
+where
+    J: Copy + Add<Output = J> + Default + 'static,
+{
+    type Output = Self;
+
+    #[inline]
+    fn add(self, rhs: ColorGroup<COMPS, J>) -> Self::Output {
+        if COMPS == 1 {
+            ColorGroup::from_components(self.r + rhs.r, self.g, self.b, self.a)
+        } else if COMPS == 2 {
+            ColorGroup::from_components(self.r + rhs.r, self.g + rhs.g, self.b, self.a)
+        } else if COMPS == 3 {
+            ColorGroup::from_components(self.r + rhs.r, self.g + rhs.g, self.b + rhs.b, self.a)
+        } else if COMPS == 4 {
+            ColorGroup::from_components(
+                self.r + rhs.r,
+                self.g + rhs.g,
+                self.b + rhs.b,
+                self.a + rhs.a,
+            )
+        } else {
+            unimplemented!();
+        }
+    }
+}
+
+macro_rules! st_group {
+    ($store: expr, $channels: expr, $offset: expr, $vl: expr) => {{
+        if $channels == 1 {
+            $store[$offset] = $vl.r.to_();
+        } else if $channels == 2 {
+            $store[$offset] = $vl.r.to_();
+            $store[$offset + 1] = $vl.g.to_();
+        } else if $channels == 3 {
+            $store[$offset] = $vl.r.to_();
+            $store[$offset + 1] = $vl.g.to_();
+            $store[$offset + 2] = $vl.b.to_();
+        } else if $channels == 4 {
+            $store[$offset] = $vl.r.to_();
+            $store[$offset + 1] = $vl.g.to_();
+            $store[$offset + 2] = $vl.b.to_();
+            $store[$offset + 3] = $vl.a.to_();
+        } else {
+            unimplemented!()
+        }
+    }};
+}
+
+/// Performs column convolution pass for symmetrical filter
+///
+/// Common convolution formula O(x,y)=∑K(k)⋅I(x,y+k); where sums goes from 0...R
+/// when filter is symmetric that we can half kernel reads by using formula
+/// O(x,y)=(∑K(k)⋅(I(x,y+k) + I(x,y+(R-k)))) + K(R/2)⋅I(x,y+R/2); where sums goes from 0...R/2
+fn filter_symmetric_column<T, F>(
+    arena: Arena<T>,
+    arena_src: &[&[T]],
+    dst_row: &mut [T],
+    image_size: FilterImageSize,
+    kernel: &[F],
+) where
+    T: Copy + AsPrimitive<F>,
+    F: ToStorage<T>
+        + Mul<F, Output = F>
+        + MulAdd<F, Output = F>
+        + Add<F, Output = F>
+        + Default
+        + Copy
+        + 'static,
+{
+    let dst_stride = image_size.width * arena.components;
+
+    let length = kernel.len();
+    let half_len = length / 2;
+
+    let mut cx = 0usize;
+
+    while cx + 32 < dst_stride {
+        let coeff = kernel[half_len];
+
+        let mut store: [F; 32] = [F::default(); 32];
+
+        let v_src = &arena_src[half_len][cx..(cx + 32)];
+
+        for (dst, src) in store.iter_mut().zip(v_src) {
+            *dst = src.as_().mul(coeff);
+        }
+
+        for (i, &coeff) in kernel.iter().take(half_len).enumerate() {
+            let rollback = length - i - 1;
+            let fw = &arena_src[i][cx..(cx + 32)];
+            let bw = &arena_src[rollback][cx..(cx + 32)];
+
+            for ((dst, fw), bw) in store.iter_mut().zip(fw).zip(bw) {
+                *dst = mla(*dst, fw.as_().add(bw.as_()), coeff);
+            }
+        }
+
+        let shaped_dst = &mut dst_row[cx..(cx + 32)];
+        for (src, dst) in store.iter().zip(shaped_dst.iter_mut()) {
+            *dst = src.to_();
+        }
+
+        cx += 32;
+    }
+
+    while cx + 16 < dst_stride {
+        let coeff = kernel[half_len];
+
+        let mut store: [F; 16] = [F::default(); 16];
+
+        let v_src = &arena_src[half_len][cx..(cx + 16)];
+
+        for (dst, src) in store.iter_mut().zip(v_src) {
+            *dst = src.as_().mul(coeff);
+        }
+
+        for (i, &coeff) in kernel.iter().take(half_len).enumerate() {
+            let rollback = length - i - 1;
+            let fw = &arena_src[i][cx..(cx + 16)];
+            let bw = &arena_src[rollback][cx..(cx + 16)];
+
+            for ((dst, fw), bw) in store.iter_mut().zip(fw).zip(bw) {
+                *dst = mla(*dst, fw.as_().add(bw.as_()), coeff);
+            }
+        }
+
+        let shaped_dst = &mut dst_row[cx..(cx + 16)];
+        for (src, dst) in store.iter().zip(shaped_dst.iter_mut()) {
+            *dst = src.to_();
+        }
+
+        cx += 16;
+    }
+
+    while cx + 4 < dst_stride {
+        let coeff = kernel[half_len];
+
+        let v_src = &arena_src[half_len][cx..(cx + 4)];
+
+        let mut k0 = v_src[0].as_().mul(coeff);
+        let mut k1 = v_src[1].as_().mul(coeff);
+        let mut k2 = v_src[2].as_().mul(coeff);
+        let mut k3 = v_src[3].as_().mul(coeff);
+
+        for (i, &coeff) in kernel.iter().take(half_len).enumerate() {
+            let rollback = length - i - 1;
+            let fw = &arena_src[i][cx..(cx + 4)];
+            let bw = &arena_src[rollback][cx..(cx + 4)];
+            k0 = mla(k0, fw[0].as_().add(bw[0].as_()), coeff);
+            k1 = mla(k1, fw[1].as_().add(bw[1].as_()), coeff);
+            k2 = mla(k2, fw[2].as_().add(bw[2].as_()), coeff);
+            k3 = mla(k3, fw[3].as_().add(bw[3].as_()), coeff);
+        }
+
+        let shaped_dst = &mut dst_row[cx..(cx + 4)];
+        shaped_dst[0] = k0.to_();
+        shaped_dst[1] = k1.to_();
+        shaped_dst[2] = k2.to_();
+        shaped_dst[3] = k3.to_();
+        cx += 4;
+    }
+
+    #[allow(clippy::needless_range_loop)]
+    for x in cx..dst_stride {
+        let coeff = kernel[half_len];
+
+        let v_src = &arena_src[half_len][x..(x + 1)];
+
+        let mut k0 = v_src[0].as_().mul(coeff);
+
+        for (i, &coeff) in kernel.iter().take(half_len).enumerate() {
+            let rollback = length - i - 1;
+            let fw = &arena_src[i][x..(x + 1)];
+            let bw = &arena_src[rollback][x..(x + 1)];
+            k0 = mla(k0, fw[0].as_().add(bw[0].as_()), coeff);
+        }
+
+        dst_row[x] = k0.to_();
+    }
+}
+
+/// Performs horizontal convolution for row
+///
+/// Common convolution formula O(x,y)=∑K(k)⋅I(x+k,y); where sums goes from 0...R
+/// when filter is symmetric that we can half kernel reads by using formula
+/// O(x,y)=(∑K(k)⋅(I(x+k,y) + I(x+(R-k),y))) + K(R/2)⋅I(x+R/2,y); where sums goes from 0...R/2
+///
+/// It is important to use color groups because for whole group one weight must be applied
+fn filter_symmetric_row<T, F, const N: usize>(
+    arena: Arena<T>,
+    dst_row: &mut [T],
+    image_size: FilterImageSize,
+    scanned_kernel: &[F],
+) where
+    T: Copy + AsPrimitive<F> + Default,
+    F: ToStorage<T>
+        + Mul<Output = F>
+        + MulAdd<F, Output = F>
+        + Default
+        + Add<F, Output = F>
+        + Copy
+        + 'static,
+    i32: AsPrimitive<F>,
+{
+    let width = image_size.width;
+
+    let src = arena.src;
+
+    let length = scanned_kernel.len();
+    let half_len = length / 2;
+
+    let mut cx = 0usize;
+
+    while cx + 8 < width {
+        let v_cx = cx * N;
+        let src = &src[cx * N..(v_cx + length * N + N * 8)];
+        let coeff = scanned_kernel[half_len];
+
+        let mut k0: ColorGroup<N, F> = ld_group!(src, N, half_len * N).mul(coeff);
+        let mut k1: ColorGroup<N, F> = ld_group!(src, N, half_len * N + N).mul(coeff);
+        let mut k2: ColorGroup<N, F> = ld_group!(src, N, half_len * N + N * 2).mul(coeff);
+        let mut k3: ColorGroup<N, F> = ld_group!(src, N, half_len * N + N * 3).mul(coeff);
+        let mut k4: ColorGroup<N, F> = ld_group!(src, N, half_len * N + N * 4).mul(coeff);
+        let mut k5: ColorGroup<N, F> = ld_group!(src, N, half_len * N + N * 5).mul(coeff);
+        let mut k6: ColorGroup<N, F> = ld_group!(src, N, half_len * N + N * 6).mul(coeff);
+        let mut k7: ColorGroup<N, F> = ld_group!(src, N, half_len * N + N * 7).mul(coeff);
+
+        for (i, &coeff) in scanned_kernel.iter().take(half_len).enumerate() {
+            let rollback = length - i - 1;
+            let fw = &src[(i * N)..((i + 8) * N)];
+            let bw = &src[(rollback * N)..((rollback + 8) * N)];
+            k0 = ld_group!(fw, N, 0)
+                .add(ld_group!(bw, N, 0))
+                .mul_add(k0, coeff);
+            k1 = ld_group!(fw, N, N)
+                .add(ld_group!(bw, N, N))
+                .mul_add(k1, coeff);
+            k2 = ld_group!(fw, N, 2 * N)
+                .add(ld_group!(bw, N, 2 * N))
+                .mul_add(k2, coeff);
+            k3 = ld_group!(fw, N, 3 * N)
+                .add(ld_group!(bw, N, 3 * N))
+                .mul_add(k3, coeff);
+            k4 = ld_group!(fw, N, 4 * N)
+                .add(ld_group!(bw, N, 4 * N))
+                .mul_add(k4, coeff);
+            k5 = ld_group!(fw, N, 5 * N)
+                .add(ld_group!(bw, N, 5 * N))
+                .mul_add(k5, coeff);
+            k6 = ld_group!(fw, N, 6 * N)
+                .add(ld_group!(bw, N, 6 * N))
+                .mul_add(k6, coeff);
+            k7 = ld_group!(fw, N, 7 * N)
+                .add(ld_group!(bw, N, 7 * N))
+                .mul_add(k7, coeff);
+        }
+
+        let dst_offset = cx * N;
+
+        let shaped_dst = &mut dst_row[dst_offset..(dst_offset + N * 8)];
+        st_group!(shaped_dst, N, 0, k0);
+        st_group!(shaped_dst, N, N, k1);
+        st_group!(shaped_dst, N, N * 2, k2);
+        st_group!(shaped_dst, N, N * 3, k3);
+        st_group!(shaped_dst, N, N * 4, k4);
+        st_group!(shaped_dst, N, N * 5, k5);
+        st_group!(shaped_dst, N, N * 6, k6);
+        st_group!(shaped_dst, N, N * 7, k7);
+        cx += 8;
+    }
+
+    while cx + 4 < width {
+        let v_cx = cx * N;
+        let src = &src[cx * N..(v_cx + length * N + N * 4)];
+        let coeff = scanned_kernel[half_len];
+
+        let mut k0: ColorGroup<N, F> = ld_group!(src, N, half_len * N).mul(coeff);
+        let mut k1: ColorGroup<N, F> = ld_group!(src, N, half_len * N + N).mul(coeff);
+        let mut k2: ColorGroup<N, F> = ld_group!(src, N, half_len * N + N * 2).mul(coeff);
+        let mut k3: ColorGroup<N, F> = ld_group!(src, N, half_len * N + N * 3).mul(coeff);
+
+        for (i, &coeff) in scanned_kernel.iter().take(half_len).enumerate() {
+            let rollback = length - i - 1;
+            let fw = &src[(i * N)..((i + 4) * N)];
+            let bw = &src[(rollback * N)..((rollback + 4) * N)];
+            k0 = ld_group!(fw, N, 0)
+                .add(ld_group!(bw, N, 0))
+                .mul_add(k0, coeff);
+            k1 = ld_group!(fw, N, N)
+                .add(ld_group!(bw, N, N))
+                .mul_add(k1, coeff);
+            k2 = ld_group!(fw, N, 2 * N)
+                .add(ld_group!(bw, N, 2 * N))
+                .mul_add(k2, coeff);
+            k3 = ld_group!(fw, N, 3 * N)
+                .add(ld_group!(bw, N, 3 * N))
+                .mul_add(k3, coeff);
+        }
+
+        let dst_offset = cx * N;
+        let shaped_dst = &mut dst_row[dst_offset..(dst_offset + N * 4)];
+        st_group!(shaped_dst, N, 0, k0);
+        st_group!(shaped_dst, N, N, k1);
+        st_group!(shaped_dst, N, N * 2, k2);
+        st_group!(shaped_dst, N, N * 3, k3);
+        cx += 4;
+    }
+
+    #[allow(clippy::needless_range_loop)]
+    for x in cx..width {
+        let v_cx = x * N;
+        let src = &src[v_cx..(v_cx + length * N)];
+        let coeff = scanned_kernel[half_len];
+
+        let mut k0: ColorGroup<N, F> = ld_group!(src, N, half_len * N).mul(coeff);
+
+        for (i, &coeff) in scanned_kernel.iter().take(half_len).enumerate() {
+            let rollback = length - i - 1;
+            let fw = &src[(i * N)..((i + 1) * N)];
+            let bw = &src[(rollback * N)..((rollback + 1) * N)];
+            k0 = ld_group!(fw, N, 0)
+                .add(ld_group!(bw, N, 0))
+                .mul_add(k0, coeff);
+        }
+
+        let dst_offset = x * N;
+        let shaped_dst = &mut dst_row[dst_offset..(dst_offset + N)];
+        st_group!(shaped_dst, N, 0, k0);
+    }
+}
+
+trait KernelTransformer<F, I> {
+    fn transform(input: F) -> I;
+}
+
+impl KernelTransformer<f32, i32> for u8 {
+    fn transform(input: f32) -> i32 {
+        const SCALE: f32 = (1 << PRECISION) as f32;
+        (input * SCALE).round() as i32
+    }
+}
+
+impl KernelTransformer<f32, f32> for f32 {
+    fn transform(input: f32) -> f32 {
+        input
+    }
+}
+
+impl KernelTransformer<f32, f32> for u16 {
+    fn transform(input: f32) -> f32 {
+        input
+    }
+}
+
+/// Removes meaningless values from kernel preserving symmetry
+fn prepare_symmetric_kernel<I: Copy + PartialEq + 'static>(kernel: &[I]) -> Vec<I>
+where
+    i32: AsPrimitive<I>,
+{
+    let zeros: I = 0i32.as_();
+    let mut new_kernel = kernel.to_vec();
+    while new_kernel.len() > 2
+        && (new_kernel.last().unwrap().eq(&zeros) && new_kernel.first().unwrap().eq(&zeros))
+    {
+        new_kernel.remove(0);
+        new_kernel.remove(new_kernel.len() - 1);
+    }
+
+    new_kernel
+}
+
+/// Performs 2D separable convolution on the image
+///
+/// Currently implemented only for symmetrical filters
+///
+/// # Arguments
+///
+/// * `image`: Single plane image
+/// * `destination`: Destination image
+/// * `image_size`: Image size see [FilterImageSize]
+/// * `row_kernel`: Row kernel, *size must be odd*!
+/// * `column_kernel`: Column kernel, *size must be odd*!
+fn filter_1d<T, F, I, const N: usize>(
+    image: &[T],
+    destination: &mut [T],
+    image_size: FilterImageSize,
+    row_kernel: &[F],
+    column_kernel: &[F],
+) -> Result<(), ImageError>
+where
+    T: Copy + AsPrimitive<F> + Default + Send + Sync + KernelTransformer<F, I> + AsPrimitive<I>,
+    F: Default + 'static + Copy,
+    I: ToStorage<T>
+        + Mul<I, Output = I>
+        + Add<I, Output = I>
+        + MulAdd<I, Output = I>
+        + Send
+        + Sync
+        + PartialEq
+        + Default
+        + 'static
+        + Copy,
+    i32: AsPrimitive<F> + AsPrimitive<I>,
+    f64: AsPrimitive<T>,
+{
+    if image.len() != image_size.width * image_size.height * N {
+        return Err(ImageError::Limits(LimitError::from_kind(
+            LimitErrorKind::DimensionError,
+        )));
+    }
+    if destination.len() != image_size.width * image_size.height * N {
+        return Err(ImageError::Limits(LimitError::from_kind(
+            LimitErrorKind::DimensionError,
+        )));
+    }
+
+    assert_ne!(row_kernel.len() & 1, 0, "Row kernel length must be odd");
+    assert_ne!(
+        column_kernel.len() & 1,
+        0,
+        "Column kernel length must be odd"
+    );
+
+    let mut scanned_row_kernel = row_kernel
+        .iter()
+        .map(|&x| T::transform(x))
+        .collect::<Vec<I>>();
+    let mut scanned_column_kernel = column_kernel
+        .iter()
+        .map(|&x| T::transform(x))
+        .collect::<Vec<I>>();
+
+    scanned_row_kernel = prepare_symmetric_kernel(&scanned_row_kernel);
+    scanned_column_kernel = prepare_symmetric_kernel(&scanned_column_kernel);
+
+    let zeros: I = 0i32.as_();
+
+    if scanned_row_kernel.is_empty()
+        || scanned_column_kernel.is_empty()
+        || (scanned_row_kernel.len() == 1 && scanned_row_kernel.first().unwrap().eq(&zeros))
+        || (scanned_column_kernel.len() == 1 && scanned_column_kernel.first().unwrap().eq(&zeros))
+    {
+        destination.fill(T::default());
+        return Ok(());
+    }
+
+    let mut transient_image = vec![T::default(); image_size.width * image_size.height * N];
+
+    for (y, dst) in transient_image
+        .chunks_exact_mut(image_size.width * N)
+        .enumerate()
+    {
+        // This is important to perform padding before convolution.
+        // That approach allows to iterate without unnecessary branching
+        // and highly effective from low sized kernels to reasonable huge kernels.
+        // It is especially more effective for low sized kernels than copying
+        // specifically for big sized images.
+        // If image row is `asdfgh` then this method with clamp will produce row
+        // `aaa asdfgh hhh` padded exactly on half kernel size
+        let arena = make_arena_row::<T, N>(
+            image,
+            y,
+            image_size,
+            KernelShape {
+                width: row_kernel.len(),
+                height: 0,
+            },
+        );
+
+        filter_symmetric_row::<T, I, N>(
+            Arena {
+                src: &arena.arena,
+                components: N,
+            },
+            dst,
+            image_size,
+            &scanned_row_kernel,
+        );
+    }
+
+    let column_kernel_shape = KernelShape {
+        width: 0,
+        height: scanned_column_kernel.len(),
+    };
+
+    // This is important to perform padding before convolution.
+    // That approach allows to iterate without unnecessary branching
+    // and highly effective from low sized kernels to reasonable huge kernels.
+    // It is especially more effective for low sized kernels than copying
+    // specifically for big sized images.
+    // `This is virtual padding!` that means it produces two non-contiguous arrays.
+    // They will virtually replace image row, when convolution kernel goes out of bounds on Y coordinate.
+    let column_arena_k =
+        make_columns_arenas::<T, N>(transient_image.as_slice(), image_size, column_kernel_shape);
+
+    let top_pad = column_arena_k.top_pad.as_slice();
+    let bottom_pad = column_arena_k.bottom_pad.as_slice();
+
+    let pad_h = column_kernel_shape.height / 2;
+
+    let transient_image_slice = transient_image.as_slice();
+
+    let src_stride = image_size.width * N;
+
+    for (y, dst) in destination
+        .chunks_exact_mut(image_size.width * N)
+        .enumerate()
+    {
+        let mut brows: Vec<&[T]> = vec![&transient_image_slice[0..]; column_kernel_shape.height];
+
+        for (k, row) in (0..column_kernel_shape.height).zip(brows.iter_mut()) {
+            if (y as i64 - pad_h as i64 + k as i64) < 0 {
+                *row = &top_pad[(pad_h - k - 1) * src_stride..];
+            } else if (y as i64 - pad_h as i64 + k as i64) as usize >= image_size.height {
+                *row = &bottom_pad[(k - pad_h - 1) * src_stride..];
+            } else {
+                let fy = (y as i64 + k as i64 - pad_h as i64) as usize;
+                let start_offset = src_stride * fy;
+                *row = &transient_image_slice[start_offset..(start_offset + src_stride)];
+            }
+        }
+
+        let empty_vec = vec![];
+
+        let brows_slice = brows.as_slice();
+
+        filter_symmetric_column(
+            Arena {
+                src: &empty_vec,
+                components: N,
+            },
+            brows_slice,
+            dst,
+            image_size,
+            &scanned_column_kernel,
+        );
+    }
+
+    Ok(())
+}
+
+pub(crate) fn filter_1d_plane(
+    image: &[u8],
+    destination: &mut [u8],
+    image_size: FilterImageSize,
+    row_kernel: &[f32],
+    column_kernel: &[f32],
+) -> Result<(), ImageError> {
+    filter_1d::<u8, f32, i32, 1>(image, destination, image_size, row_kernel, column_kernel)
+}
+
+pub(crate) fn filter_1d_la(
+    image: &[u8],
+    destination: &mut [u8],
+    image_size: FilterImageSize,
+    row_kernel: &[f32],
+    column_kernel: &[f32],
+) -> Result<(), ImageError> {
+    filter_1d::<u8, f32, i32, 2>(image, destination, image_size, row_kernel, column_kernel)
+}
+
+pub(crate) fn filter_1d_rgb(
+    image: &[u8],
+    destination: &mut [u8],
+    image_size: FilterImageSize,
+    row_kernel: &[f32],
+    column_kernel: &[f32],
+) -> Result<(), ImageError> {
+    filter_1d::<u8, f32, i32, 3>(image, destination, image_size, row_kernel, column_kernel)
+}
+
+pub(crate) fn filter_1d_rgba(
+    image: &[u8],
+    destination: &mut [u8],
+    image_size: FilterImageSize,
+    row_kernel: &[f32],
+    column_kernel: &[f32],
+) -> Result<(), ImageError> {
+    filter_1d::<u8, f32, i32, 4>(image, destination, image_size, row_kernel, column_kernel)
+}
+
+pub(crate) fn filter_1d_rgb_f32(
+    image: &[f32],
+    destination: &mut [f32],
+    image_size: FilterImageSize,
+    row_kernel: &[f32],
+    column_kernel: &[f32],
+) -> Result<(), ImageError> {
+    filter_1d::<f32, f32, f32, 3>(image, destination, image_size, row_kernel, column_kernel)
+}
+
+pub(crate) fn filter_1d_rgba_f32(
+    image: &[f32],
+    destination: &mut [f32],
+    image_size: FilterImageSize,
+    row_kernel: &[f32],
+    column_kernel: &[f32],
+) -> Result<(), ImageError> {
+    filter_1d::<f32, f32, f32, 4>(image, destination, image_size, row_kernel, column_kernel)
+}
+
+pub(crate) fn filter_1d_rgb_u16(
+    image: &[u16],
+    destination: &mut [u16],
+    image_size: FilterImageSize,
+    row_kernel: &[f32],
+    column_kernel: &[f32],
+) -> Result<(), ImageError> {
+    filter_1d::<u16, f32, f32, 3>(image, destination, image_size, row_kernel, column_kernel)
+}
+
+pub(crate) fn filter_1d_rgba_u16(
+    image: &[u16],
+    destination: &mut [u16],
+    image_size: FilterImageSize,
+    row_kernel: &[f32],
+    column_kernel: &[f32],
+) -> Result<(), ImageError> {
+    filter_1d::<u16, f32, f32, 4>(image, destination, image_size, row_kernel, column_kernel)
+}
+
+pub(crate) fn filter_1d_la_u16(
+    image: &[u16],
+    destination: &mut [u16],
+    image_size: FilterImageSize,
+    row_kernel: &[f32],
+    column_kernel: &[f32],
+) -> Result<(), ImageError> {
+    filter_1d::<u16, f32, f32, 2>(image, destination, image_size, row_kernel, column_kernel)
+}
+
+pub(crate) fn filter_1d_plane_u16(
+    image: &[u16],
+    destination: &mut [u16],
+    image_size: FilterImageSize,
+    row_kernel: &[f32],
+    column_kernel: &[f32],
+) -> Result<(), ImageError> {
+    filter_1d::<u16, f32, f32, 1>(image, destination, image_size, row_kernel, column_kernel)
+}
diff --git a/src/imageops/mod.rs b/src/imageops/mod.rs
index 61a742e038..82b46133e5 100644
--- a/src/imageops/mod.rs
+++ b/src/imageops/mod.rs
@@ -17,8 +17,8 @@ pub use self::affine::{
 
 /// Image sampling
 pub use self::sample::{
-    blur, filter3x3, interpolate_bilinear, interpolate_nearest, resize, sample_bilinear,
-    sample_nearest, thumbnail, unsharpen,
+    blur, filter3x3, gaussian_blur_dyn_image, interpolate_bilinear, interpolate_nearest, resize,
+    sample_bilinear, sample_nearest, thumbnail, unsharpen,
 };
 
 /// Color operations
@@ -32,6 +32,7 @@ mod affine;
 // https://github.com/rust-lang/rust/issues/18241
 pub mod colorops;
 mod fast_blur;
+mod filter_1d;
 mod sample;
 
 pub use fast_blur::fast_blur;
diff --git a/src/imageops/sample.rs b/src/imageops/sample.rs
index 1f6ce53a77..4dab66510d 100644
--- a/src/imageops/sample.rs
+++ b/src/imageops/sample.rs
@@ -3,14 +3,23 @@
 // See http://cs.brown.edu/courses/cs123/lectures/08_Image_Processing_IV.pdf
 // for some of the theory behind image scaling and convolution
 
-use std::f32;
-
 use num_traits::{NumCast, ToPrimitive, Zero};
+use std::f32;
+use std::ops::Mul;
 
+use crate::buffer_::{Gray16Image, GrayAlpha16Image, Rgb16Image, Rgba16Image};
 use crate::image::{GenericImage, GenericImageView};
+use crate::imageops::filter_1d::{
+    filter_1d_la, filter_1d_la_u16, filter_1d_plane, filter_1d_plane_u16, filter_1d_rgb,
+    filter_1d_rgb_f32, filter_1d_rgb_u16, filter_1d_rgba, filter_1d_rgba_f32, filter_1d_rgba_u16,
+    FilterImageSize,
+};
 use crate::traits::{Enlargeable, Pixel, Primitive};
 use crate::utils::clamp;
-use crate::{ImageBuffer, Rgba32FImage};
+use crate::{
+    DynamicImage, GrayAlphaImage, GrayImage, ImageBuffer, Rgb32FImage, RgbImage, Rgba32FImage,
+    RgbaImage,
+};
 
 /// Available Sampling Filters.
 ///
@@ -1013,6 +1022,188 @@ where
     horizontal_sample(&tmp, width, &mut method)
 }
 
+/// In previous implementation sigma means radius, which is not the same one
+pub fn gaussian_blur_dyn_image(image: &DynamicImage, radius: f32) -> DynamicImage {
+    fn get_sigma_size(kernel_size: usize) -> f32 {
+        0.3f32 * ((kernel_size as f32 - 1.) * 0.5f32 - 1f32) + 0.8f32
+    }
+
+    fn get_gaussian_kernel_1d(width: usize, sigma: f32) -> Vec<f32> {
+        let mut sum_norm: f32 = 0f32;
+        let mut kernel = vec![0f32; width];
+        let scale = 1f32 / (f32::sqrt(2f32 * f32::consts::PI) * sigma);
+        let mean = (width / 2) as f32;
+
+        for (x, weight) in kernel.iter_mut().enumerate() {
+            let new_weight =
+                f32::exp(-0.5f32 * f32::powf((x as f32 - mean) / sigma, 2.0f32)) * scale;
+            *weight = new_weight;
+            sum_norm += new_weight;
+        }
+
+        if sum_norm != 0f32 {
+            let sum_scale = 1f32 / sum_norm;
+            for weight in kernel.iter_mut() {
+                *weight = weight.mul(sum_scale);
+            }
+        }
+
+        kernel
+    }
+
+    let kernel_size = radius.max(1.) as usize * 2 + 1;
+    let sigma = get_sigma_size(kernel_size);
+    let gaussian_kernel = get_gaussian_kernel_1d(kernel_size, sigma);
+
+    let filter_image_size = FilterImageSize {
+        width: image.width() as usize,
+        height: image.height() as usize,
+    };
+
+    match image {
+        DynamicImage::ImageLuma8(img) => {
+            let mut dest_image = vec![0u8; img.len()];
+            filter_1d_plane(
+                img.as_raw(),
+                &mut dest_image,
+                filter_image_size,
+                &gaussian_kernel,
+                &gaussian_kernel,
+            )
+            .unwrap();
+            DynamicImage::ImageLuma8(
+                GrayImage::from_raw(img.width(), img.height(), dest_image).unwrap(),
+            )
+        }
+        DynamicImage::ImageLumaA8(img) => {
+            let mut dest_image = vec![0u8; img.len()];
+            filter_1d_la(
+                img.as_raw(),
+                &mut dest_image,
+                filter_image_size,
+                &gaussian_kernel,
+                &gaussian_kernel,
+            )
+            .unwrap();
+            DynamicImage::ImageLumaA8(
+                GrayAlphaImage::from_raw(img.width(), img.height(), dest_image).unwrap(),
+            )
+        }
+        DynamicImage::ImageRgb8(img) => {
+            let mut dest_image = vec![0u8; img.len()];
+            filter_1d_rgb(
+                img.as_raw(),
+                &mut dest_image,
+                filter_image_size,
+                &gaussian_kernel,
+                &gaussian_kernel,
+            )
+            .unwrap();
+            DynamicImage::ImageRgb8(
+                RgbImage::from_raw(img.width(), img.height(), dest_image).unwrap(),
+            )
+        }
+        DynamicImage::ImageRgba8(img) => {
+            let mut dest_image = vec![0u8; img.len()];
+            filter_1d_rgba(
+                img.as_raw(),
+                &mut dest_image,
+                filter_image_size,
+                &gaussian_kernel,
+                &gaussian_kernel,
+            )
+            .unwrap();
+            DynamicImage::ImageRgba8(
+                RgbaImage::from_raw(img.width(), img.height(), dest_image).unwrap(),
+            )
+        }
+        DynamicImage::ImageLuma16(img) => {
+            let mut dest_image = vec![0u16; img.len()];
+            filter_1d_plane_u16(
+                img.as_raw(),
+                &mut dest_image,
+                filter_image_size,
+                &gaussian_kernel,
+                &gaussian_kernel,
+            )
+            .unwrap();
+            DynamicImage::ImageLuma16(
+                Gray16Image::from_raw(img.width(), img.height(), dest_image).unwrap(),
+            )
+        }
+        DynamicImage::ImageLumaA16(img) => {
+            let mut dest_image = vec![0u16; img.len()];
+            filter_1d_la_u16(
+                img.as_raw(),
+                &mut dest_image,
+                filter_image_size,
+                &gaussian_kernel,
+                &gaussian_kernel,
+            )
+            .unwrap();
+            DynamicImage::ImageLumaA16(
+                GrayAlpha16Image::from_raw(img.width(), img.height(), dest_image).unwrap(),
+            )
+        }
+        DynamicImage::ImageRgb16(img) => {
+            let mut dest_image = vec![0u16; img.len()];
+            filter_1d_rgb_u16(
+                img.as_raw(),
+                &mut dest_image,
+                filter_image_size,
+                &gaussian_kernel,
+                &gaussian_kernel,
+            )
+            .unwrap();
+            DynamicImage::ImageRgb16(
+                Rgb16Image::from_raw(img.width(), img.height(), dest_image).unwrap(),
+            )
+        }
+        DynamicImage::ImageRgba16(img) => {
+            let mut dest_image = vec![0u16; img.len()];
+            filter_1d_rgba_u16(
+                img.as_raw(),
+                &mut dest_image,
+                filter_image_size,
+                &gaussian_kernel,
+                &gaussian_kernel,
+            )
+            .unwrap();
+            DynamicImage::ImageRgba16(
+                Rgba16Image::from_raw(img.width(), img.height(), dest_image).unwrap(),
+            )
+        }
+        DynamicImage::ImageRgb32F(img) => {
+            let mut dest_image = vec![0f32; img.len()];
+            filter_1d_rgb_f32(
+                img.as_raw(),
+                &mut dest_image,
+                filter_image_size,
+                &gaussian_kernel,
+                &gaussian_kernel,
+            )
+            .unwrap();
+            DynamicImage::ImageRgb32F(
+                Rgb32FImage::from_raw(img.width(), img.height(), dest_image).unwrap(),
+            )
+        }
+        DynamicImage::ImageRgba32F(img) => {
+            let mut dest_image = vec![0f32; img.len()];
+            filter_1d_rgba_f32(
+                img.as_raw(),
+                &mut dest_image,
+                filter_image_size,
+                &gaussian_kernel,
+                &gaussian_kernel,
+            )
+            .unwrap();
+            DynamicImage::ImageRgba32F(
+                Rgba32FImage::from_raw(img.width(), img.height(), dest_image).unwrap(),
+            )
+        }
+    }
+}
+
 /// Performs an unsharpen mask on the supplied image.
 /// ```sigma``` is the amount to blur the image by.
 /// ```threshold``` is the threshold for minimal brightness change that will be sharpened.

From b5fa826c8608c401a667bfaa066835ce56f7ff43 Mon Sep 17 00:00:00 2001
From: Radzivon Bartoshyk <radzivon.bartoshyk@proton.me>
Date: Tue, 3 Dec 2024 22:14:13 +0000
Subject: [PATCH 2/9] Loop unrolling changed

---
 src/imageops/filter_1d.rs | 70 ++++++++++++++++++++++++++++-----------
 1 file changed, 51 insertions(+), 19 deletions(-)

diff --git a/src/imageops/filter_1d.rs b/src/imageops/filter_1d.rs
index d70cf25221..f09971ea63 100644
--- a/src/imageops/filter_1d.rs
+++ b/src/imageops/filter_1d.rs
@@ -2,7 +2,9 @@
 use crate::error::{LimitError, LimitErrorKind};
 use crate::ImageError;
 use num_traits::{AsPrimitive, MulAdd};
+use std::mem::size_of;
 use std::ops::{Add, Mul};
+use std::time::Instant;
 
 #[cfg(any(
     all(
@@ -401,33 +403,55 @@ fn filter_symmetric_column<T, F>(
 
     let mut cx = 0usize;
 
-    while cx + 32 < dst_stride {
-        let coeff = kernel[half_len];
+    if size_of::<T>() == 1 {
+        while cx + 32 < dst_stride {
+            let coeff = kernel[half_len];
 
-        let mut store: [F; 32] = [F::default(); 32];
+            let mut store0: [F; 16] = [F::default(); 16];
+            let mut store1: [F; 16] = [F::default(); 16];
 
-        let v_src = &arena_src[half_len][cx..(cx + 32)];
+            let v_src0 = &arena_src[half_len][cx..(cx + 16)];
+            let v_src1 = &arena_src[half_len][(cx + 16)..(cx + 32)];
 
-        for (dst, src) in store.iter_mut().zip(v_src) {
-            *dst = src.as_().mul(coeff);
-        }
+            for (dst, src) in store0.iter_mut().zip(v_src0) {
+                *dst = src.as_().mul(coeff);
+            }
+            for (dst, src) in store1.iter_mut().zip(v_src1) {
+                *dst = src.as_().mul(coeff);
+            }
 
-        for (i, &coeff) in kernel.iter().take(half_len).enumerate() {
-            let rollback = length - i - 1;
-            let fw = &arena_src[i][cx..(cx + 32)];
-            let bw = &arena_src[rollback][cx..(cx + 32)];
+            for (i, &coeff) in kernel.iter().take(half_len).enumerate() {
+                let rollback = length - i - 1;
+                let fw_src = arena_src[i];
+                let rw_src = arena_src[rollback];
+                let fw0 = &fw_src[cx..(cx + 16)];
+                let bw0 = &rw_src[cx..(cx + 16)];
+                let fw1 = &fw_src[(cx + 16)..(cx + 32)];
+                let bw1 = &rw_src[(cx + 16)..(cx + 32)];
+
+                for ((dst, fw), bw) in store0.iter_mut().zip(fw0).zip(bw0) {
+                    *dst = mla(*dst, fw.as_().add(bw.as_()), coeff);
+                }
+
+                for ((dst, fw), bw) in store1.iter_mut().zip(fw1).zip(bw1) {
+                    *dst = mla(*dst, fw.as_().add(bw.as_()), coeff);
+                }
+            }
 
-            for ((dst, fw), bw) in store.iter_mut().zip(fw).zip(bw) {
-                *dst = mla(*dst, fw.as_().add(bw.as_()), coeff);
+            let shaped_dst0 = &mut dst_row[cx..(cx + 16)];
+
+            for (src, dst) in store0.iter().zip(shaped_dst0.iter_mut()) {
+                *dst = src.to_();
             }
-        }
 
-        let shaped_dst = &mut dst_row[cx..(cx + 32)];
-        for (src, dst) in store.iter().zip(shaped_dst.iter_mut()) {
-            *dst = src.to_();
-        }
+            let shaped_dst1 = &mut dst_row[(cx + 16)..(cx + 32)];
+
+            for (src, dst) in store1.iter().zip(shaped_dst1.iter_mut()) {
+                *dst = src.to_();
+            }
 
-        cx += 32;
+            cx += 32;
+        }
     }
 
     while cx + 16 < dst_stride {
@@ -773,6 +797,8 @@ where
 
     let mut transient_image = vec![T::default(); image_size.width * image_size.height * N];
 
+    let start_time = Instant::now();
+
     for (y, dst) in transient_image
         .chunks_exact_mut(image_size.width * N)
         .enumerate()
@@ -805,6 +831,8 @@ where
         );
     }
 
+    println!("Horizontal time {:?}", start_time.elapsed());
+
     let column_kernel_shape = KernelShape {
         width: 0,
         height: scanned_column_kernel.len(),
@@ -829,6 +857,8 @@ where
 
     let src_stride = image_size.width * N;
 
+    let start_time = Instant::now();
+
     for (y, dst) in destination
         .chunks_exact_mut(image_size.width * N)
         .enumerate()
@@ -863,6 +893,8 @@ where
         );
     }
 
+    println!("Vertical time {:?}", start_time.elapsed());
+
     Ok(())
 }
 

From 1c73c52f4137162f0b1cc11197f36f1d46e5f6b6 Mon Sep 17 00:00:00 2001
From: Radzivon Bartoshyk <radzivon.bartoshyk@proton.me>
Date: Tue, 3 Dec 2024 22:34:11 +0000
Subject: [PATCH 3/9] Removed time

---
 src/imageops/filter_1d.rs | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/src/imageops/filter_1d.rs b/src/imageops/filter_1d.rs
index f09971ea63..e7bc137ec6 100644
--- a/src/imageops/filter_1d.rs
+++ b/src/imageops/filter_1d.rs
@@ -4,7 +4,6 @@ use crate::ImageError;
 use num_traits::{AsPrimitive, MulAdd};
 use std::mem::size_of;
 use std::ops::{Add, Mul};
-use std::time::Instant;
 
 #[cfg(any(
     all(
@@ -797,8 +796,6 @@ where
 
     let mut transient_image = vec![T::default(); image_size.width * image_size.height * N];
 
-    let start_time = Instant::now();
-
     for (y, dst) in transient_image
         .chunks_exact_mut(image_size.width * N)
         .enumerate()
@@ -831,8 +828,6 @@ where
         );
     }
 
-    println!("Horizontal time {:?}", start_time.elapsed());
-
     let column_kernel_shape = KernelShape {
         width: 0,
         height: scanned_column_kernel.len(),
@@ -857,8 +852,6 @@ where
 
     let src_stride = image_size.width * N;
 
-    let start_time = Instant::now();
-
     for (y, dst) in destination
         .chunks_exact_mut(image_size.width * N)
         .enumerate()
@@ -893,8 +886,6 @@ where
         );
     }
 
-    println!("Vertical time {:?}", start_time.elapsed());
-
     Ok(())
 }
 

From ed9078ab0d2962f69d168fc937ffc308885c3c65 Mon Sep 17 00:00:00 2001
From: Radzivon Bartoshyk <radzivon.bartoshyk@proton.me>
Date: Thu, 5 Dec 2024 10:03:58 +0000
Subject: [PATCH 4/9] Fix clippy

---
 src/imageops/filter_1d.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/imageops/filter_1d.rs b/src/imageops/filter_1d.rs
index e7bc137ec6..8fb2a66fdc 100644
--- a/src/imageops/filter_1d.rs
+++ b/src/imageops/filter_1d.rs
@@ -121,8 +121,8 @@ struct ArenaColumns<T>
 where
     T: Copy,
 {
-    pub top_pad: Vec<T>,
-    pub bottom_pad: Vec<T>,
+    top_pad: Vec<T>,
+    bottom_pad: Vec<T>,
 }
 
 /// Pads a column image with *clamp* strategy

From 64022dcdec8f4b391ec5d78f8e67f1d43dddf157 Mon Sep 17 00:00:00 2001
From: Radzivon Bartoshyk <radzivon.bartoshyk@proton.me>
Date: Tue, 17 Dec 2024 00:07:35 +0000
Subject: [PATCH 5/9] Handle generic image

---
 src/imageops/filter_1d.rs |  20 ++++
 src/imageops/mod.rs       |   3 +-
 src/imageops/sample.rs    | 232 ++++++++++++++++++++++++++++++--------
 3 files changed, 207 insertions(+), 48 deletions(-)

diff --git a/src/imageops/filter_1d.rs b/src/imageops/filter_1d.rs
index 8fb2a66fdc..2081faa562 100644
--- a/src/imageops/filter_1d.rs
+++ b/src/imageops/filter_1d.rs
@@ -929,6 +929,26 @@ pub(crate) fn filter_1d_rgba(
     filter_1d::<u8, f32, i32, 4>(image, destination, image_size, row_kernel, column_kernel)
 }
 
+pub(crate) fn filter_1d_la_f32(
+    image: &[f32],
+    destination: &mut [f32],
+    image_size: FilterImageSize,
+    row_kernel: &[f32],
+    column_kernel: &[f32],
+) -> Result<(), ImageError> {
+    filter_1d::<f32, f32, f32, 2>(image, destination, image_size, row_kernel, column_kernel)
+}
+
+pub(crate) fn filter_1d_plane_f32(
+    image: &[f32],
+    destination: &mut [f32],
+    image_size: FilterImageSize,
+    row_kernel: &[f32],
+    column_kernel: &[f32],
+) -> Result<(), ImageError> {
+    filter_1d::<f32, f32, f32, 1>(image, destination, image_size, row_kernel, column_kernel)
+}
+
 pub(crate) fn filter_1d_rgb_f32(
     image: &[f32],
     destination: &mut [f32],
diff --git a/src/imageops/mod.rs b/src/imageops/mod.rs
index 82b46133e5..587ca6bed1 100644
--- a/src/imageops/mod.rs
+++ b/src/imageops/mod.rs
@@ -17,9 +17,10 @@ pub use self::affine::{
 
 /// Image sampling
 pub use self::sample::{
-    blur, filter3x3, gaussian_blur_dyn_image, interpolate_bilinear, interpolate_nearest, resize,
+    blur, filter3x3, interpolate_bilinear, interpolate_nearest, resize,
     sample_bilinear, sample_nearest, thumbnail, unsharpen,
 };
+pub(crate) use self::sample::gaussian_blur_dyn_image;
 
 /// Color operations
 pub use self::colorops::{
diff --git a/src/imageops/sample.rs b/src/imageops/sample.rs
index 5d3949b4d0..573cf90524 100644
--- a/src/imageops/sample.rs
+++ b/src/imageops/sample.rs
@@ -10,9 +10,9 @@ use std::ops::Mul;
 use crate::buffer_::{Gray16Image, GrayAlpha16Image, Rgb16Image, Rgba16Image};
 use crate::image::{GenericImage, GenericImageView};
 use crate::imageops::filter_1d::{
-    filter_1d_la, filter_1d_la_u16, filter_1d_plane, filter_1d_plane_u16, filter_1d_rgb,
-    filter_1d_rgb_f32, filter_1d_rgb_u16, filter_1d_rgba, filter_1d_rgba_f32, filter_1d_rgba_u16,
-    FilterImageSize,
+    filter_1d_la, filter_1d_la_f32, filter_1d_la_u16, filter_1d_plane, filter_1d_plane_f32,
+    filter_1d_plane_u16, filter_1d_rgb, filter_1d_rgb_f32, filter_1d_rgb_u16, filter_1d_rgba,
+    filter_1d_rgba_f32, filter_1d_rgba_u16, FilterImageSize,
 };
 use crate::traits::{Enlargeable, Pixel, Primitive};
 use crate::utils::clamp;
@@ -1002,58 +1002,53 @@ pub fn blur<I: GenericImageView>(
 where
     I::Pixel: 'static,
 {
-    let sigma = if sigma <= 0.0 { 1.0 } else { sigma };
+    // let sigma = if sigma <= 0.0 { 1.0 } else { sigma };
+    //
+    // let mut method = Filter {
+    //     kernel: Box::new(|x| gaussian(x, sigma)),
+    //     support: 2.0 * sigma,
+    // };
+    //
+    // let (width, height) = image.dimensions();
+    // let is_empty = width == 0 || height == 0;
+    //
+    // if is_empty {
+    //     return ImageBuffer::new(width, height);
+    // }
+    //
+    // // Keep width and height the same for horizontal and
+    // // vertical sampling.
+    // // Note: tmp is not necessarily actually Rgba
+    // let tmp: Rgba32FImage = vertical_sample(image, height, &mut method);
+    // horizontal_sample(&tmp, width, &mut method)
+    gaussian_blur_indirect(image, sigma)
+}
 
-    let mut method = Filter {
-        kernel: Box::new(|x| gaussian(x, sigma)),
-        support: 2.0 * sigma,
-    };
+fn get_gaussian_kernel_1d(width: usize, sigma: f32) -> Vec<f32> {
+    let mut sum_norm: f32 = 0f32;
+    let mut kernel = vec![0f32; width];
+    let scale = 1f32 / (f32::sqrt(2f32 * f32::consts::PI) * sigma);
+    let mean = (width / 2) as f32;
 
-    let (width, height) = image.dimensions();
-    let is_empty = width == 0 || height == 0;
+    for (x, weight) in kernel.iter_mut().enumerate() {
+        let new_weight = f32::exp(-0.5f32 * f32::powf((x as f32 - mean) / sigma, 2.0f32)) * scale;
+        *weight = new_weight;
+        sum_norm += new_weight;
+    }
 
-    if is_empty {
-        return ImageBuffer::new(width, height);
+    if sum_norm != 0f32 {
+        let sum_scale = 1f32 / sum_norm;
+        for weight in kernel.iter_mut() {
+            *weight = weight.mul(sum_scale);
+        }
     }
 
-    // Keep width and height the same for horizontal and
-    // vertical sampling.
-    // Note: tmp is not necessarily actually Rgba
-    let tmp: Rgba32FImage = vertical_sample(image, height, &mut method);
-    horizontal_sample(&tmp, width, &mut method)
+    kernel
 }
 
 /// In previous implementation sigma means radius, which is not the same one
-pub fn gaussian_blur_dyn_image(image: &DynamicImage, radius: f32) -> DynamicImage {
-    fn get_sigma_size(kernel_size: usize) -> f32 {
-        0.3f32 * ((kernel_size as f32 - 1.) * 0.5f32 - 1f32) + 0.8f32
-    }
-
-    fn get_gaussian_kernel_1d(width: usize, sigma: f32) -> Vec<f32> {
-        let mut sum_norm: f32 = 0f32;
-        let mut kernel = vec![0f32; width];
-        let scale = 1f32 / (f32::sqrt(2f32 * f32::consts::PI) * sigma);
-        let mean = (width / 2) as f32;
-
-        for (x, weight) in kernel.iter_mut().enumerate() {
-            let new_weight =
-                f32::exp(-0.5f32 * f32::powf((x as f32 - mean) / sigma, 2.0f32)) * scale;
-            *weight = new_weight;
-            sum_norm += new_weight;
-        }
-
-        if sum_norm != 0f32 {
-            let sum_scale = 1f32 / sum_norm;
-            for weight in kernel.iter_mut() {
-                *weight = weight.mul(sum_scale);
-            }
-        }
-
-        kernel
-    }
-
-    let kernel_size = radius.max(1.) as usize * 2 + 1;
-    let sigma = get_sigma_size(kernel_size);
+pub(crate) fn gaussian_blur_dyn_image(image: &DynamicImage, sigma: f32) -> DynamicImage {
+    let kernel_size = sigma.max(1.) as usize * 2 + 1;
     let gaussian_kernel = get_gaussian_kernel_1d(kernel_size, sigma);
 
     let filter_image_size = FilterImageSize {
@@ -1205,6 +1200,149 @@ pub fn gaussian_blur_dyn_image(image: &DynamicImage, radius: f32) -> DynamicImag
     }
 }
 
+pub(crate) fn gaussian_blur_indirect<I: GenericImageView>(
+    image: &I,
+    sigma: f32,
+) -> ImageBuffer<I::Pixel, Vec<<I::Pixel as Pixel>::Subpixel>>
+where
+    I::Pixel: 'static,
+{
+    match I::Pixel::CHANNEL_COUNT {
+        1 => gaussian_blur_indirect_impl::<I, 1>(image, sigma),
+        2 => gaussian_blur_indirect_impl::<I, 2>(image, sigma),
+        3 => gaussian_blur_indirect_impl::<I, 3>(image, sigma),
+        4 => gaussian_blur_indirect_impl::<I, 4>(image, sigma),
+        _ => unimplemented!(),
+    }
+}
+
+fn gaussian_blur_indirect_impl<I: GenericImageView, const CN: usize>(
+    image: &I,
+    sigma: f32,
+) -> ImageBuffer<I::Pixel, Vec<<I::Pixel as Pixel>::Subpixel>>
+where
+    I::Pixel: 'static,
+{
+    let mut transient = vec![0f32; image.width() as usize * image.height() as usize * CN];
+    for (pixel, dst) in image.pixels().zip(transient.chunks_exact_mut(CN)) {
+        let px = pixel.2.channels();
+        match CN {
+            1 => {
+                dst[0] = NumCast::from(px[0]).unwrap();
+            }
+            2 => {
+                dst[0] = NumCast::from(px[0]).unwrap();
+                dst[1] = NumCast::from(px[1]).unwrap();
+            }
+            3 => {
+                dst[0] = NumCast::from(px[0]).unwrap();
+                dst[1] = NumCast::from(px[1]).unwrap();
+                dst[2] = NumCast::from(px[2]).unwrap();
+            }
+            4 => {
+                dst[0] = NumCast::from(px[0]).unwrap();
+                dst[1] = NumCast::from(px[1]).unwrap();
+                dst[2] = NumCast::from(px[2]).unwrap();
+                dst[3] = NumCast::from(px[3]).unwrap();
+            }
+            _ => unreachable!(),
+        }
+    }
+
+    let mut transient_dst = vec![0f32; image.width() as usize * image.height() as usize * CN];
+
+    let kernel_size = sigma.max(1.) as usize * 2 + 1;
+    let gaussian_kernel = get_gaussian_kernel_1d(kernel_size, sigma);
+
+    let filter_image_size = FilterImageSize {
+        width: image.width() as usize,
+        height: image.height() as usize,
+    };
+
+    match CN {
+        1 => {
+            filter_1d_plane_f32(
+                &transient,
+                &mut transient_dst,
+                filter_image_size,
+                &gaussian_kernel,
+                &gaussian_kernel,
+            )
+            .unwrap();
+        }
+        2 => {
+            filter_1d_la_f32(
+                &transient,
+                &mut transient_dst,
+                filter_image_size,
+                &gaussian_kernel,
+                &gaussian_kernel,
+            )
+            .unwrap();
+        }
+        3 => {
+            filter_1d_rgb_f32(
+                &transient,
+                &mut transient_dst,
+                filter_image_size,
+                &gaussian_kernel,
+                &gaussian_kernel,
+            )
+            .unwrap();
+        }
+        4 => {
+            filter_1d_rgba_f32(
+                &transient,
+                &mut transient_dst,
+                filter_image_size,
+                &gaussian_kernel,
+                &gaussian_kernel,
+            )
+            .unwrap();
+        }
+        _ => unreachable!(),
+    }
+
+    let mut out = ImageBuffer::new(image.width(), image.height());
+    for (dst, src) in out.pixels_mut().zip(transient_dst.chunks_exact_mut(CN)) {
+        match CN {
+            1 => {
+                let v0 = NumCast::from(FloatNearest(src[0])).unwrap();
+                #[allow(deprecated)]
+                let t = Pixel::from_channels(v0, v0, v0, v0);
+                *dst = t;
+            }
+            2 => {
+                let v0 = NumCast::from(FloatNearest(src[0])).unwrap();
+                let v1 = NumCast::from(FloatNearest(src[1])).unwrap();
+                #[allow(deprecated)]
+                let t = Pixel::from_channels(v0, v1, v0, v0);
+                *dst = t;
+            }
+            3 => {
+                let v0 = NumCast::from(FloatNearest(src[0])).unwrap();
+                let v1 = NumCast::from(FloatNearest(src[1])).unwrap();
+                let v2 = NumCast::from(FloatNearest(src[2])).unwrap();
+                #[allow(deprecated)]
+                let t = Pixel::from_channels(v0, v1, v2, v0);
+                *dst = t;
+            }
+            4 => {
+                let v0 = NumCast::from(FloatNearest(src[0])).unwrap();
+                let v1 = NumCast::from(FloatNearest(src[1])).unwrap();
+                let v2 = NumCast::from(FloatNearest(src[2])).unwrap();
+                let v3 = NumCast::from(FloatNearest(src[3])).unwrap();
+                #[allow(deprecated)]
+                let t = Pixel::from_channels(v0, v1, v2, v3);
+                *dst = t;
+            }
+            _ => unreachable!(),
+        }
+    }
+
+    out
+}
+
 /// Performs an unsharpen mask on the supplied image.
 /// ```sigma``` is the amount to blur the image by.
 /// ```threshold``` is the threshold for minimal brightness change that will be sharpened.

From c87304e58a5fe1011dbc5b83d7c56db783017dc6 Mon Sep 17 00:00:00 2001
From: Radzivon Bartoshyk <radzivon.bartoshyk@proton.me>
Date: Tue, 17 Dec 2024 00:14:02 +0000
Subject: [PATCH 6/9] Fix base sigma level

---
 src/imageops/sample.rs | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/imageops/sample.rs b/src/imageops/sample.rs
index 573cf90524..e66e9a5d01 100644
--- a/src/imageops/sample.rs
+++ b/src/imageops/sample.rs
@@ -1048,8 +1048,9 @@ fn get_gaussian_kernel_1d(width: usize, sigma: f32) -> Vec<f32> {
 
 /// In previous implementation sigma means radius, which is not the same one
 pub(crate) fn gaussian_blur_dyn_image(image: &DynamicImage, sigma: f32) -> DynamicImage {
-    let kernel_size = sigma.max(1.) as usize * 2 + 1;
-    let gaussian_kernel = get_gaussian_kernel_1d(kernel_size, sigma);
+    let min_sigma = sigma.max(0.1);
+    let kernel_size = min_sigma as usize * 2 + 1;
+    let gaussian_kernel = get_gaussian_kernel_1d(kernel_size, min_sigma);
 
     let filter_image_size = FilterImageSize {
         width: image.width() as usize,
@@ -1251,8 +1252,10 @@ where
 
     let mut transient_dst = vec![0f32; image.width() as usize * image.height() as usize * CN];
 
-    let kernel_size = sigma.max(1.) as usize * 2 + 1;
-    let gaussian_kernel = get_gaussian_kernel_1d(kernel_size, sigma);
+    let min_sigma = sigma.max(0.1);
+
+    let kernel_size = min_sigma as usize * 2 + 1;
+    let gaussian_kernel = get_gaussian_kernel_1d(kernel_size, min_sigma);
 
     let filter_image_size = FilterImageSize {
         width: image.width() as usize,

From 581362e06327879e97a5aab95c54e47ef0cf7d06 Mon Sep 17 00:00:00 2001
From: Radzivon Bartoshyk <radzivon.bartoshyk@proton.me>
Date: Tue, 17 Dec 2024 00:15:02 +0000
Subject: [PATCH 7/9] Cargo fmt

---
 src/imageops/mod.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/imageops/mod.rs b/src/imageops/mod.rs
index 587ca6bed1..fba6be4a31 100644
--- a/src/imageops/mod.rs
+++ b/src/imageops/mod.rs
@@ -15,12 +15,12 @@ pub use self::affine::{
     rotate90, rotate90_in,
 };
 
+pub(crate) use self::sample::gaussian_blur_dyn_image;
 /// Image sampling
 pub use self::sample::{
-    blur, filter3x3, interpolate_bilinear, interpolate_nearest, resize,
-    sample_bilinear, sample_nearest, thumbnail, unsharpen,
+    blur, filter3x3, interpolate_bilinear, interpolate_nearest, resize, sample_bilinear,
+    sample_nearest, thumbnail, unsharpen,
 };
-pub(crate) use self::sample::gaussian_blur_dyn_image;
 
 /// Color operations
 pub use self::colorops::{

From fea58bd935e8766a147084d9f9ea3496e56d251c Mon Sep 17 00:00:00 2001
From: Radzivon Bartoshyk <radzivon.bartoshyk@proton.me>
Date: Tue, 17 Dec 2024 00:32:34 +0000
Subject: [PATCH 8/9] Drop unnecessary public

---
 src/imageops/sample.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/imageops/sample.rs b/src/imageops/sample.rs
index e66e9a5d01..d4dd8665b2 100644
--- a/src/imageops/sample.rs
+++ b/src/imageops/sample.rs
@@ -1201,7 +1201,7 @@ pub(crate) fn gaussian_blur_dyn_image(image: &DynamicImage, sigma: f32) -> Dynam
     }
 }
 
-pub(crate) fn gaussian_blur_indirect<I: GenericImageView>(
+fn gaussian_blur_indirect<I: GenericImageView>(
     image: &I,
     sigma: f32,
 ) -> ImageBuffer<I::Pixel, Vec<<I::Pixel as Pixel>::Subpixel>>

From b7cf70aac93e2d22f0aa0a5783e5af623bb94add Mon Sep 17 00:00:00 2001
From: Radzivon Bartoshyk <radzivon.bartoshyk@proton.me>
Date: Tue, 17 Dec 2024 00:34:29 +0000
Subject: [PATCH 9/9] Drop old methods

---
 src/dynimage.rs        |  1 -
 src/imageops/sample.rs | 24 ++----------------------
 2 files changed, 2 insertions(+), 23 deletions(-)

diff --git a/src/dynimage.rs b/src/dynimage.rs
index ee0d15327a..a86b8a85f0 100644
--- a/src/dynimage.rs
+++ b/src/dynimage.rs
@@ -831,7 +831,6 @@ impl DynamicImage {
     #[must_use]
     pub fn blur(&self, sigma: f32) -> DynamicImage {
         gaussian_blur_dyn_image(self, sigma)
-        // dynamic_map!(*self, ref p => imageops::blur(p, sigma))
     }
 
     /// Performs a fast blur on this image.
diff --git a/src/imageops/sample.rs b/src/imageops/sample.rs
index d4dd8665b2..9b0a0743f7 100644
--- a/src/imageops/sample.rs
+++ b/src/imageops/sample.rs
@@ -1002,25 +1002,6 @@ pub fn blur<I: GenericImageView>(
 where
     I::Pixel: 'static,
 {
-    // let sigma = if sigma <= 0.0 { 1.0 } else { sigma };
-    //
-    // let mut method = Filter {
-    //     kernel: Box::new(|x| gaussian(x, sigma)),
-    //     support: 2.0 * sigma,
-    // };
-    //
-    // let (width, height) = image.dimensions();
-    // let is_empty = width == 0 || height == 0;
-    //
-    // if is_empty {
-    //     return ImageBuffer::new(width, height);
-    // }
-    //
-    // // Keep width and height the same for horizontal and
-    // // vertical sampling.
-    // // Note: tmp is not necessarily actually Rgba
-    // let tmp: Rgba32FImage = vertical_sample(image, height, &mut method);
-    // horizontal_sample(&tmp, width, &mut method)
     gaussian_blur_indirect(image, sigma)
 }
 
@@ -1046,9 +1027,8 @@ fn get_gaussian_kernel_1d(width: usize, sigma: f32) -> Vec<f32> {
     kernel
 }
 
-/// In previous implementation sigma means radius, which is not the same one
 pub(crate) fn gaussian_blur_dyn_image(image: &DynamicImage, sigma: f32) -> DynamicImage {
-    let min_sigma = sigma.max(0.1);
+    let min_sigma = sigma.max(1.0f32);
     let kernel_size = min_sigma as usize * 2 + 1;
     let gaussian_kernel = get_gaussian_kernel_1d(kernel_size, min_sigma);
 
@@ -1252,7 +1232,7 @@ where
 
     let mut transient_dst = vec![0f32; image.width() as usize * image.height() as usize * CN];
 
-    let min_sigma = sigma.max(0.1);
+    let min_sigma = sigma.max(1.0);
 
     let kernel_size = min_sigma as usize * 2 + 1;
     let gaussian_kernel = get_gaussian_kernel_1d(kernel_size, min_sigma);