diff --git a/src/asm/x86/lrf.rs b/src/asm/x86/lrf.rs index 9c9194c92c..6428e6c2c2 100644 --- a/src/asm/x86/lrf.rs +++ b/src/asm/x86/lrf.rs @@ -19,16 +19,15 @@ use std::mem; // computes an intermediate (ab) row for stripe_w + 2 columns at row y #[inline] -pub fn sgrproj_box_ab_r1( +pub fn sgrproj_box_ab_r1( af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32], - iimg_stride: usize, y: usize, stripe_w: usize, s: u32, bdm8: usize, - cpu: CpuFeatureLevel, + iimg_stride: usize, y: usize, stripe_w: usize, s: u32, cpu: CpuFeatureLevel, ) { - // only use 8-bit AVX2 assembly when bitdepth minus 8 equals 0 - if cpu >= CpuFeatureLevel::AVX2 && bdm8 == 0 { + // only use 8-bit AVX2 assembly when bitdepth equals 8 + if cpu >= CpuFeatureLevel::AVX2 && BD == 8 { // SAFETY: Calls Assembly code. return unsafe { - sgrproj_box_ab_r1_avx2( + sgrproj_box_ab_r1_avx2::( af, bf, iimg, @@ -37,12 +36,11 @@ pub fn sgrproj_box_ab_r1( y, stripe_w, s, - bdm8, ); }; } - rust::sgrproj_box_ab_r1( + rust::sgrproj_box_ab_r1::( af, bf, iimg, @@ -51,23 +49,21 @@ pub fn sgrproj_box_ab_r1( y, stripe_w, s, - bdm8, cpu, ); } // computes an intermediate (ab) row for stripe_w + 2 columns at row y #[inline] -pub fn sgrproj_box_ab_r2( +pub fn sgrproj_box_ab_r2( af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32], - iimg_stride: usize, y: usize, stripe_w: usize, s: u32, bdm8: usize, - cpu: CpuFeatureLevel, + iimg_stride: usize, y: usize, stripe_w: usize, s: u32, cpu: CpuFeatureLevel, ) { - // only use 8-bit AVX2 assembly when bitdepth minus 8 equals 0 - if cpu >= CpuFeatureLevel::AVX2 && bdm8 == 0 { + // only use 8-bit AVX2 assembly when bitdepth equals 8 + if cpu >= CpuFeatureLevel::AVX2 && BD == 8 { // SAFETY: Calls Assembly code. return unsafe { - sgrproj_box_ab_r2_avx2( + sgrproj_box_ab_r2_avx2::( af, bf, iimg, @@ -76,12 +72,11 @@ pub fn sgrproj_box_ab_r2( y, stripe_w, s, - bdm8, ); }; } - rust::sgrproj_box_ab_r2( + rust::sgrproj_box_ab_r2::( af, bf, iimg, @@ -90,7 +85,6 @@ pub fn sgrproj_box_ab_r2( y, stripe_w, s, - bdm8, cpu, ); } @@ -164,10 +158,11 @@ static X_BY_XPLUS1: [u32; 256] = [ #[inline] #[target_feature(enable = "avx2")] -unsafe fn sgrproj_box_ab_8_avx2( +unsafe fn sgrproj_box_ab_8_avx2( r: usize, af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32], - iimg_stride: usize, x: usize, y: usize, s: u32, bdm8: usize, + iimg_stride: usize, x: usize, y: usize, s: u32, ) { + let bdm8 = BD - 8; let d: usize = r * 2 + 1; let n: i32 = (d * d) as i32; let one_over_n = if r == 1 { 455 } else { 164 }; @@ -240,13 +235,13 @@ unsafe fn sgrproj_box_ab_8_avx2( } #[target_feature(enable = "avx2")] -pub(crate) unsafe fn sgrproj_box_ab_r1_avx2( +pub(crate) unsafe fn sgrproj_box_ab_r1_avx2( af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32], - iimg_stride: usize, y: usize, stripe_w: usize, s: u32, bdm8: usize, + iimg_stride: usize, y: usize, stripe_w: usize, s: u32, ) { for x in (0..stripe_w + 2).step_by(8) { if x + 8 <= stripe_w + 2 { - sgrproj_box_ab_8_avx2( + sgrproj_box_ab_8_avx2::( 1, af, bf, @@ -256,11 +251,10 @@ pub(crate) unsafe fn sgrproj_box_ab_r1_avx2( x, y, s, - bdm8, ); } else { // finish using scalar - rust::sgrproj_box_ab_internal( + rust::sgrproj_box_ab_internal::( 1, af, bf, @@ -271,7 +265,6 @@ pub(crate) unsafe fn sgrproj_box_ab_r1_avx2( y, stripe_w, s, - bdm8, ); } } @@ -280,7 +273,7 @@ pub(crate) unsafe fn sgrproj_box_ab_r1_avx2( { let mut af_ref: Vec = vec![0; stripe_w + 2]; let mut bf_ref: Vec = vec![0; stripe_w + 2]; - rust::sgrproj_box_ab_internal( + rust::sgrproj_box_ab_internal::( 1, &mut af_ref, &mut bf_ref, @@ -291,7 +284,6 @@ pub(crate) unsafe fn sgrproj_box_ab_r1_avx2( y, stripe_w, s, - bdm8, ); assert_eq!(&af[..stripe_w + 2], &af_ref[..]); assert_eq!(&bf[..stripe_w + 2], &bf_ref[..]); @@ -299,13 +291,13 @@ pub(crate) unsafe fn sgrproj_box_ab_r1_avx2( } #[target_feature(enable = "avx2")] -pub(crate) unsafe fn sgrproj_box_ab_r2_avx2( +pub(crate) unsafe fn sgrproj_box_ab_r2_avx2( af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32], - iimg_stride: usize, y: usize, stripe_w: usize, s: u32, bdm8: usize, + iimg_stride: usize, y: usize, stripe_w: usize, s: u32, ) { for x in (0..stripe_w + 2).step_by(8) { if x + 8 <= stripe_w + 2 { - sgrproj_box_ab_8_avx2( + sgrproj_box_ab_8_avx2::( 2, af, bf, @@ -315,11 +307,10 @@ pub(crate) unsafe fn sgrproj_box_ab_r2_avx2( x, y, s, - bdm8, ); } else { // finish using scalar - rust::sgrproj_box_ab_internal( + rust::sgrproj_box_ab_internal::( 2, af, bf, @@ -330,7 +321,6 @@ pub(crate) unsafe fn sgrproj_box_ab_r2_avx2( y, stripe_w, s, - bdm8, ); } } @@ -339,7 +329,7 @@ pub(crate) unsafe fn sgrproj_box_ab_r2_avx2( { let mut af_ref: Vec = vec![0; stripe_w + 2]; let mut bf_ref: Vec = vec![0; stripe_w + 2]; - rust::sgrproj_box_ab_internal( + rust::sgrproj_box_ab_internal::( 2, &mut af_ref, &mut bf_ref, @@ -350,7 +340,6 @@ pub(crate) unsafe fn sgrproj_box_ab_r2_avx2( y, stripe_w, s, - bdm8, ); assert_eq!(&af[..stripe_w + 2], &af_ref[..]); assert_eq!(&bf[..stripe_w + 2], &bf_ref[..]); diff --git a/src/lrf.rs b/src/lrf.rs index 380a36dde6..f33a48826e 100644 --- a/src/lrf.rs +++ b/src/lrf.rs @@ -174,10 +174,9 @@ pub(crate) mod rust { use crate::Pixel; #[inline(always)] - pub(crate) fn sgrproj_box_ab_internal( + pub(crate) fn sgrproj_box_ab_internal( r: usize, af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32], iimg_stride: usize, start_x: usize, y: usize, stripe_w: usize, s: u32, - bdm8: usize, ) { let d: usize = r * 2 + 1; let n: usize = d * d; @@ -187,13 +186,14 @@ pub(crate) mod rust { assert!(iimg_sq.len() > (y + d) * iimg_stride + stripe_w + d); assert!(af.len() > stripe_w); assert!(bf.len() > stripe_w); + for x in start_x..stripe_w + 2 { // SAFETY: We perform the bounds checks above, once for the whole loop unsafe { let sum = get_integral_square(iimg, iimg_stride, x, y, d); let ssq = get_integral_square(iimg_sq, iimg_stride, x, y, d); let (reta, retb) = - sgrproj_sum_finish(ssq, sum, n as u32, one_over_n, s, bdm8); + sgrproj_sum_finish::(ssq, sum, n as u32, one_over_n, s); *af.get_unchecked_mut(x) = reta; *bf.get_unchecked_mut(x) = retb; } @@ -201,12 +201,12 @@ pub(crate) mod rust { } // computes an intermediate (ab) row for stripe_w + 2 columns at row y - pub(crate) fn sgrproj_box_ab_r1( + pub(crate) fn sgrproj_box_ab_r1( af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32], - iimg_stride: usize, y: usize, stripe_w: usize, s: u32, bdm8: usize, + iimg_stride: usize, y: usize, stripe_w: usize, s: u32, _cpu: CpuFeatureLevel, ) { - sgrproj_box_ab_internal( + sgrproj_box_ab_internal::( 1, af, bf, @@ -217,17 +217,16 @@ pub(crate) mod rust { y, stripe_w, s, - bdm8, ); } // computes an intermediate (ab) row for stripe_w + 2 columns at row y - pub(crate) fn sgrproj_box_ab_r2( + pub(crate) fn sgrproj_box_ab_r2( af: &mut [u32], bf: &mut [u32], iimg: &[u32], iimg_sq: &[u32], - iimg_stride: usize, y: usize, stripe_w: usize, s: u32, bdm8: usize, + iimg_stride: usize, y: usize, stripe_w: usize, s: u32, _cpu: CpuFeatureLevel, ) { - sgrproj_box_ab_internal( + sgrproj_box_ab_internal::( 2, af, bf, @@ -238,7 +237,6 @@ pub(crate) mod rust { y, stripe_w, s, - bdm8, ); } @@ -345,9 +343,10 @@ pub(crate) mod rust { } #[inline(always)] -fn sgrproj_sum_finish( - ssq: u32, sum: u32, n: u32, one_over_n: u32, s: u32, bdm8: usize, +fn sgrproj_sum_finish( + ssq: u32, sum: u32, n: u32, one_over_n: u32, s: u32, ) -> (u32, u32) { + let bdm8 = BD - 8; let scaled_ssq = (ssq + (1 << (2 * bdm8) >> 1)) >> (2 * bdm8); let scaled_sum = (sum + (1 << bdm8 >> 1)) >> bdm8; let p = (scaled_ssq * n).saturating_sub(scaled_sum * scaled_sum); @@ -633,7 +632,6 @@ pub fn sgrproj_stripe_filter( cdeffed: &PlaneSlice, out: &mut PlaneRegionMut, ) { let &Rect { width: stripe_w, height: stripe_h, .. } = out.rect(); - let bdm8 = fi.sequence.bit_depth - 8; let mut a_r2: [[u32; IMAGE_WIDTH_MAX + 2]; 2] = [[0; IMAGE_WIDTH_MAX + 2]; 2]; let mut b_r2: [[u32; IMAGE_WIDTH_MAX + 2]; 2] = @@ -649,6 +647,19 @@ pub fn sgrproj_stripe_filter( let s_r2: u32 = SGRPROJ_PARAMS_S[set as usize][0]; let s_r1: u32 = SGRPROJ_PARAMS_S[set as usize][1]; + let fn_ab_r1 = match fi.sequence.bit_depth { + 8 => sgrproj_box_ab_r1::<8>, + 10 => sgrproj_box_ab_r1::<10>, + 12 => sgrproj_box_ab_r1::<12>, + _ => unimplemented!(), + }; + let fn_ab_r2 = match fi.sequence.bit_depth { + 8 => sgrproj_box_ab_r2::<8>, + 10 => sgrproj_box_ab_r2::<10>, + 12 => sgrproj_box_ab_r2::<12>, + _ => unimplemented!(), + }; + /* prime the intermediate arrays */ // One oddness about the radius=2 intermediate array computations that // the spec doesn't make clear: Although the spec defines computation @@ -657,7 +668,7 @@ pub fn sgrproj_stripe_filter( let integral_image = &integral_image_buffer.integral_image; let sq_integral_image = &integral_image_buffer.sq_integral_image; if s_r2 > 0 { - sgrproj_box_ab_r2( + fn_ab_r2( &mut a_r2[0], &mut b_r2[0], integral_image, @@ -666,13 +677,12 @@ pub fn sgrproj_stripe_filter( 0, stripe_w, s_r2, - bdm8, fi.cpu_feature_level, ); } if s_r1 > 0 { let integral_image_offset = integral_image_stride + 1; - sgrproj_box_ab_r1( + fn_ab_r1( &mut a_r1[0], &mut b_r1[0], &integral_image[integral_image_offset..], @@ -681,10 +691,9 @@ pub fn sgrproj_stripe_filter( 0, stripe_w, s_r1, - bdm8, fi.cpu_feature_level, ); - sgrproj_box_ab_r1( + fn_ab_r1( &mut a_r1[1], &mut b_r1[1], &integral_image[integral_image_offset..], @@ -693,7 +702,6 @@ pub fn sgrproj_stripe_filter( 1, stripe_w, s_r1, - bdm8, fi.cpu_feature_level, ); } @@ -704,7 +712,7 @@ pub fn sgrproj_stripe_filter( for y in (0..stripe_h).step_by(2) { // get results to use y and y+1 let f_r2_ab: [&[u32]; 2] = if s_r2 > 0 { - sgrproj_box_ab_r2( + fn_ab_r2( &mut a_r2[(y / 2 + 1) % 2], &mut b_r2[(y / 2 + 1) % 2], integral_image, @@ -713,7 +721,6 @@ pub fn sgrproj_stripe_filter( y + 2, stripe_w, s_r2, - bdm8, fi.cpu_feature_level, ); let ap0: [&[u32]; 2] = [&a_r2[(y / 2) % 2], &a_r2[(y / 2 + 1) % 2]]; @@ -744,7 +751,7 @@ pub fn sgrproj_stripe_filter( let y = y + dy; if s_r1 > 0 { let integral_image_offset = integral_image_stride + 1; - sgrproj_box_ab_r1( + fn_ab_r1( &mut a_r1[(y + 2) % 3], &mut b_r1[(y + 2) % 3], &integral_image[integral_image_offset..], @@ -753,7 +760,6 @@ pub fn sgrproj_stripe_filter( y + 2, stripe_w, s_r1, - bdm8, fi.cpu_feature_level, ); let ap1: [&[u32]; 3] = @@ -841,8 +847,6 @@ pub fn sgrproj_solve( integral_image_buffer: &IntegralImageBuffer, input: &PlaneRegion<'_, T>, cdeffed: &PlaneSlice, cdef_w: usize, cdef_h: usize, ) -> (i8, i8) { - let bdm8 = fi.sequence.bit_depth - 8; - let mut a_r2: [[u32; IMAGE_WIDTH_MAX + 2]; 2] = [[0; IMAGE_WIDTH_MAX + 2]; 2]; let mut b_r2: [[u32; IMAGE_WIDTH_MAX + 2]; 2] = @@ -861,6 +865,19 @@ pub fn sgrproj_solve( let mut h: [[f64; 2]; 2] = [[0., 0.], [0., 0.]]; let mut c: [f64; 2] = [0., 0.]; + let fn_ab_r1 = match fi.sequence.bit_depth { + 8 => sgrproj_box_ab_r1::<8>, + 10 => sgrproj_box_ab_r1::<10>, + 12 => sgrproj_box_ab_r1::<12>, + _ => unimplemented!(), + }; + let fn_ab_r2 = match fi.sequence.bit_depth { + 8 => sgrproj_box_ab_r2::<8>, + 10 => sgrproj_box_ab_r2::<10>, + 12 => sgrproj_box_ab_r2::<12>, + _ => unimplemented!(), + }; + /* prime the intermediate arrays */ // One oddness about the radius=2 intermediate array computations that // the spec doesn't make clear: Although the spec defines computation @@ -869,7 +886,7 @@ pub fn sgrproj_solve( let integral_image = &integral_image_buffer.integral_image; let sq_integral_image = &integral_image_buffer.sq_integral_image; if s_r2 > 0 { - sgrproj_box_ab_r2( + fn_ab_r2( &mut a_r2[0], &mut b_r2[0], integral_image, @@ -878,13 +895,12 @@ pub fn sgrproj_solve( 0, cdef_w, s_r2, - bdm8, fi.cpu_feature_level, ); } if s_r1 > 0 { let integral_image_offset = SOLVE_IMAGE_STRIDE + 1; - sgrproj_box_ab_r1( + fn_ab_r1( &mut a_r1[0], &mut b_r1[0], &integral_image[integral_image_offset..], @@ -893,10 +909,9 @@ pub fn sgrproj_solve( 0, cdef_w, s_r1, - bdm8, fi.cpu_feature_level, ); - sgrproj_box_ab_r1( + fn_ab_r1( &mut a_r1[1], &mut b_r1[1], &integral_image[integral_image_offset..], @@ -905,7 +920,6 @@ pub fn sgrproj_solve( 1, cdef_w, s_r1, - bdm8, fi.cpu_feature_level, ); } @@ -916,7 +930,7 @@ pub fn sgrproj_solve( for y in (0..cdef_h).step_by(2) { // get results to use y and y+1 let f_r2_01: [&[u32]; 2] = if s_r2 > 0 { - sgrproj_box_ab_r2( + fn_ab_r2( &mut a_r2[(y / 2 + 1) % 2], &mut b_r2[(y / 2 + 1) % 2], integral_image, @@ -925,7 +939,6 @@ pub fn sgrproj_solve( y + 2, cdef_w, s_r2, - bdm8, fi.cpu_feature_level, ); let ap0: [&[u32]; 2] = [&a_r2[(y / 2) % 2], &a_r2[(y / 2 + 1) % 2]]; @@ -950,7 +963,7 @@ pub fn sgrproj_solve( let y = y + dy; if s_r1 > 0 { let integral_image_offset = SOLVE_IMAGE_STRIDE + 1; - sgrproj_box_ab_r1( + fn_ab_r1( &mut a_r1[(y + 2) % 3], &mut b_r1[(y + 2) % 3], &integral_image[integral_image_offset..], @@ -959,7 +972,6 @@ pub fn sgrproj_solve( y + 2, cdef_w, s_r1, - bdm8, fi.cpu_feature_level, ); let ap1: [&[u32]; 3] =