diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index dca7c9fe..d2a51c46 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -16,6 +16,13 @@ jobs: rust: ["1.34.2", stable, beta, nightly] features: ["", "rayon"] command: [test, benchmark] + include: + - rust: nightly + features: packed_simd + command: test + - rust: nightly + features: packed_simd + command: benchmark steps: - uses: actions/checkout@v2 - run: rustup default ${{ matrix.rust }} @@ -24,10 +31,12 @@ jobs: cargo build --verbose --no-default-features --features "$FEATURES" && cargo test --tests --benches --no-default-features --features "$FEATURES" if: ${{ matrix.command == 'test' }} + continue-on-error: ${{ matrix.rust == 'nightly' }} env: FEATURES: ${{ matrix.features }} - name: benchmark run: cargo bench --bench decoding_benchmark --no-default-features --features "$FEATURES" -- --warm-up-time 1 --measurement-time 1 --sample-size 25 if: ${{ matrix.command == 'benchmark' }} + continue-on-error: ${{ matrix.rust == 'nightly' }} env: FEATURES: ${{ matrix.features }} \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index 7687eb9a..36887120 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,6 +13,8 @@ exclude = ["tests/*"] [dependencies] byteorder = "1.0" rayon = { version = "1.0", optional = true } +simulated_packed_simd = "0.0.1" +packed_simd = { version = "0.3", optional = true } [dev-dependencies] png = "0.16" diff --git a/README.md b/README.md index 33524ae3..0da88531 100644 --- a/README.md +++ b/README.md @@ -31,5 +31,14 @@ fn main() { } ``` +## Performance + This crate uses [rayon](https://github.com/rayon-rs/rayon) to decode images on all available cores by default. + This can be disabled by requiring the dependency with `default-features = false`. + + This crate can optionally use [SIMD](https://en.wikipedia.org/wiki/SIMD) instructions + to decode images even faster. + This is not enabled by default because it requires a nightly compiler, + but can be activated with the `packed_simd` feature. + ## Requirements This crate compiles only with rust >= 1.34. diff --git a/src/idct.rs b/src/idct.rs index a5de0828..0f7242fa 100644 --- a/src/idct.rs +++ b/src/idct.rs @@ -4,6 +4,24 @@ use crate::parser::Dimensions; use std::num::Wrapping; +#[cfg(feature = "packed_simd")] +extern crate packed_simd; + +#[cfg(feature = "packed_simd")] +use self::packed_simd as simd; + +#[cfg(not(feature = "packed_simd"))] +extern crate simulated_packed_simd; + +#[cfg(not(feature = "packed_simd"))] +use self::simulated_packed_simd as simd; + +use self::simd::{ + i32x8, i16x8, u16x8, u8x8, + i32x4, i16x4, u16x4, u8x4, + FromCast, +}; + pub(crate) fn choose_idct_size(full_size: Dimensions, requested_size: Dimensions) -> usize { fn scaled(len: u16, scale: usize) -> u16 { ((len as u32 * scale as u32 - 1) / 8 + 1) as u16 } @@ -45,215 +63,248 @@ pub(crate) fn dequantize_and_idct_block(scale: usize, coefficients: &[i16], quan } } + +/// Transpose an 8x8 matrix represented by SIMD vectors +/// This has to be a macro not to depend on const generics +macro_rules! simd_transpose { + (i32x8, $s: expr) => {simd_transpose!(8, i32x8, $s)}; + (u8x8, $s: expr) => {simd_transpose!(8, u8x8, $s)}; + (i32x4, $s: expr) => {simd_transpose!(4, i32x4, $s)}; + (u8x4, $s: expr) => {simd_transpose!(4, u8x4, $s)}; + (4, $t:tt, $s: expr) => { + simd_transpose!([ + 0, 0,1,2,3; + 1, 0,1,2,3; + 2, 0,1,2,3; + 3, 0,1,2,3 + ], $t, $s) + }; + (8, $t:tt, $s: expr) => { + simd_transpose!([ + 0, 0,1,2,3,4,5,6,7; + 1, 0,1,2,3,4,5,6,7; + 2, 0,1,2,3,4,5,6,7; + 3, 0,1,2,3,4,5,6,7; + 4, 0,1,2,3,4,5,6,7; + 5, 0,1,2,3,4,5,6,7; + 6, 0,1,2,3,4,5,6,7; + 7, 0,1,2,3,4,5,6,7 + ], $t, $s) + }; + ([ $( $i:expr, $($j:expr),* );* ], $t: tt, $s: expr) => { + $s = [ + $( + $t::new( $( $s[$j].extract($i) ),* ) + ),* + ]; + } +} + +/// take a -128..127 value and clamp it and convert to 0..255 +macro_rules! stbi_clamp_simd { + ($source:tt, $target:tt, $x:expr) => ( + $target::from_cast( + $x.max($source::splat(0)) + .min($source::splat(255)))) +} + // This is based on stb_image's 'stbi__idct_block'. fn dequantize_and_idct_block_8x8(coefficients: &[i16], quantization_table: &[u16; 64], output_linestride: usize, output: &mut [u8]) { debug_assert_eq!(coefficients.len(), 64); - let mut temp = [Wrapping(0i32); 64]; + let coeff_vectors = [ + i32x8::from(i16x8::from_slice_unaligned(&coefficients[0..0 + 8])), + i32x8::from(i16x8::from_slice_unaligned(&coefficients[8..8 + 8])), + i32x8::from(i16x8::from_slice_unaligned(&coefficients[16..16 + 8])), + i32x8::from(i16x8::from_slice_unaligned(&coefficients[24..24 + 8])), + i32x8::from(i16x8::from_slice_unaligned(&coefficients[32..32 + 8])), + i32x8::from(i16x8::from_slice_unaligned(&coefficients[40..40 + 8])), + i32x8::from(i16x8::from_slice_unaligned(&coefficients[48..48 + 8])), + i32x8::from(i16x8::from_slice_unaligned(&coefficients[56..56 + 8])), + ]; + + let quant_vectors = [ + i32x8::from(u16x8::from_slice_unaligned(&quantization_table[0..0 + 8])), + i32x8::from(u16x8::from_slice_unaligned(&quantization_table[8..8 + 8])), + i32x8::from(u16x8::from_slice_unaligned(&quantization_table[16..16 + 8])), + i32x8::from(u16x8::from_slice_unaligned(&quantization_table[24..24 + 8])), + i32x8::from(u16x8::from_slice_unaligned(&quantization_table[32..32 + 8])), + i32x8::from(u16x8::from_slice_unaligned(&quantization_table[40..40 + 8])), + i32x8::from(u16x8::from_slice_unaligned(&quantization_table[48..48 + 8])), + i32x8::from(u16x8::from_slice_unaligned(&quantization_table[56..56 + 8])), + ]; + + let mut s: [i32x8; 8] = [ + coeff_vectors[0] * quant_vectors[0], + coeff_vectors[1] * quant_vectors[1], + coeff_vectors[2] * quant_vectors[2], + coeff_vectors[3] * quant_vectors[3], + coeff_vectors[4] * quant_vectors[4], + coeff_vectors[5] * quant_vectors[5], + coeff_vectors[6] * quant_vectors[6], + coeff_vectors[7] * quant_vectors[7], + ]; + + // constants scaled things up by 1<<12; let's bring them back + // down, but keep 2 extra bits of precision + let x = idct_1d_x(&s, 512); + let t = idct_1d_t(&s); + + s[0] = (x[0] + t[3]) >> 10; + s[1] = (x[1] + t[2]) >> 10; + s[2] = (x[2] + t[1]) >> 10; + s[3] = (x[3] + t[0]) >> 10; + s[4] = (x[3] - t[0]) >> 10; + s[5] = (x[2] - t[1]) >> 10; + s[6] = (x[1] - t[2]) >> 10; + s[7] = (x[0] - t[3]) >> 10; // columns - for i in 0 .. 8 { - // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing - if coefficients[i + 8] == 0 && coefficients[i + 16] == 0 && coefficients[i + 24] == 0 && - coefficients[i + 32] == 0 && coefficients[i + 40] == 0 && coefficients[i + 48] == 0 && - coefficients[i + 56] == 0 { - let dcterm = Wrapping(coefficients[i] as i32 * quantization_table[i] as i32) << 2; - temp[i] = dcterm; - temp[i + 8] = dcterm; - temp[i + 16] = dcterm; - temp[i + 24] = dcterm; - temp[i + 32] = dcterm; - temp[i + 40] = dcterm; - temp[i + 48] = dcterm; - temp[i + 56] = dcterm; - } - else { - let s0 = Wrapping(coefficients[i] as i32 * quantization_table[i] as i32); - let s1 = Wrapping(coefficients[i + 8] as i32 * quantization_table[i + 8] as i32); - let s2 = Wrapping(coefficients[i + 16] as i32 * quantization_table[i + 16] as i32); - let s3 = Wrapping(coefficients[i + 24] as i32 * quantization_table[i + 24] as i32); - let s4 = Wrapping(coefficients[i + 32] as i32 * quantization_table[i + 32] as i32); - let s5 = Wrapping(coefficients[i + 40] as i32 * quantization_table[i + 40] as i32); - let s6 = Wrapping(coefficients[i + 48] as i32 * quantization_table[i + 48] as i32); - let s7 = Wrapping(coefficients[i + 56] as i32 * quantization_table[i + 56] as i32); - - let p2 = s2; - let p3 = s6; - let p1 = (p2 + p3) * stbi_f2f(0.5411961); - let t2 = p1 + p3 * stbi_f2f(-1.847759065); - let t3 = p1 + p2 * stbi_f2f(0.765366865); - let p2 = s0; - let p3 = s4; - let t0 = stbi_fsh(p2 + p3); - let t1 = stbi_fsh(p2 - p3); - let x0 = t0 + t3; - let x3 = t0 - t3; - let x1 = t1 + t2; - let x2 = t1 - t2; - let t0 = s7; - let t1 = s5; - let t2 = s3; - let t3 = s1; - let p3 = t0 + t2; - let p4 = t1 + t3; - let p1 = t0 + t3; - let p2 = t1 + t2; - let p5 = (p3 + p4) * stbi_f2f(1.175875602); - let t0 = t0 * stbi_f2f(0.298631336); - let t1 = t1 * stbi_f2f(2.053119869); - let t2 = t2 * stbi_f2f(3.072711026); - let t3 = t3 * stbi_f2f(1.501321110); - let p1 = p5 + (p1 * stbi_f2f(-0.899976223)); - let p2 = p5 + (p2 * stbi_f2f(-2.562915447)); - let p3 = p3 * stbi_f2f(-1.961570560); - let p4 = p4 * stbi_f2f(-0.390180644); - let t3 = t3 + p1 + p4; - let t2 = t2 + p2 + p3; - let t1 = t1 + p2 + p4; - let t0 = t0 + p1 + p3; - - // constants scaled things up by 1<<12; let's bring them back - // down, but keep 2 extra bits of precision - let x0 = x0 + Wrapping(512); - let x1 = x1 + Wrapping(512); - let x2 = x2 + Wrapping(512); - let x3 = x3 + Wrapping(512); - - temp[i] = (x0 + t3) >> 10; - temp[i + 56] = (x0 - t3) >> 10; - temp[i + 8] = (x1 + t2) >> 10; - temp[i + 48] = (x1 - t2) >> 10; - temp[i + 16] = (x2 + t1) >> 10; - temp[i + 40] = (x2 - t1) >> 10; - temp[i + 24] = (x3 + t0) >> 10; - temp[i + 32] = (x3 - t0) >> 10; - } + simd_transpose!(i32x8, s); + + // constants scaled things up by 1<<12, plus we had 1<<2 from first + // loop, plus horizontal and vertical each scale by sqrt(8) so together + // we've got an extra 1<<3, so 1<<17 total we need to remove. + // so we want to round that, which means adding 0.5 * 1<<17, + // aka 65536. Also, we'll end up with -128 to 127 that we want + // to encode as 0..255 by adding 128, so we'll add that before the shift + let x = idct_1d_x(&s, 65536 + (128 << 17)); + let t = idct_1d_t(&s); + + let mut results = [ + stbi_clamp_simd!(i32x8,u8x8, (x[0] + t[3]) >> 17), + stbi_clamp_simd!(i32x8,u8x8, (x[1] + t[2]) >> 17), + stbi_clamp_simd!(i32x8,u8x8, (x[2] + t[1]) >> 17), + stbi_clamp_simd!(i32x8,u8x8, (x[3] + t[0]) >> 17), + stbi_clamp_simd!(i32x8,u8x8, (x[3] - t[0]) >> 17), + stbi_clamp_simd!(i32x8,u8x8, (x[2] - t[1]) >> 17), + stbi_clamp_simd!(i32x8,u8x8, (x[1] - t[2]) >> 17), + stbi_clamp_simd!(i32x8,u8x8, (x[0] - t[3]) >> 17), + ]; + + simd_transpose!(u8x8, results); + + for i in 0..8 { + let n = i * output_linestride; + results[i].write_to_slice_unaligned(&mut output[n..n + 8]); } +} - for i in 0 .. 8 { - // no fast case since the first 1D IDCT spread components out - let s0 = temp[i * 8]; - let s1 = temp[i * 8 + 1]; - let s2 = temp[i * 8 + 2]; - let s3 = temp[i * 8 + 3]; - let s4 = temp[i * 8 + 4]; - let s5 = temp[i * 8 + 5]; - let s6 = temp[i * 8 + 6]; - let s7 = temp[i * 8 + 7]; - - let p2 = s2; - let p3 = s6; - let p1 = (p2 + p3) * stbi_f2f(0.5411961); - let t2 = p1 + p3 * stbi_f2f(-1.847759065); - let t3 = p1 + p2 * stbi_f2f(0.765366865); - let p2 = s0; - let p3 = s4; - let t0 = stbi_fsh(p2 + p3); - let t1 = stbi_fsh(p2 - p3); - let x0 = t0 + t3; - let x3 = t0 - t3; - let x1 = t1 + t2; - let x2 = t1 - t2; - let t0 = s7; - let t1 = s5; - let t2 = s3; - let t3 = s1; - let p3 = t0 + t2; - let p4 = t1 + t3; - let p1 = t0 + t3; - let p2 = t1 + t2; - let p5 = (p3 + p4) * stbi_f2f(1.175875602); - let t0 = t0 * stbi_f2f(0.298631336); - let t1 = t1 * stbi_f2f(2.053119869); - let t2 = t2 * stbi_f2f(3.072711026); - let t3 = t3 * stbi_f2f(1.501321110); - let p1 = p5 + p1 * stbi_f2f(-0.899976223); - let p2 = p5 + p2 * stbi_f2f(-2.562915447); - let p3 = p3 * stbi_f2f(-1.961570560); - let p4 = p4 * stbi_f2f(-0.390180644); - let t3 = t3 + p1 + p4; - let t2 = t2 + p2 + p3; - let t1 = t1 + p2 + p4; - let t0 = t0 + p1 + p3; - - // constants scaled things up by 1<<12, plus we had 1<<2 from first - // loop, plus horizontal and vertical each scale by sqrt(8) so together - // we've got an extra 1<<3, so 1<<17 total we need to remove. - // so we want to round that, which means adding 0.5 * 1<<17, - // aka 65536. Also, we'll end up with -128 to 127 that we want - // to encode as 0..255 by adding 128, so we'll add that before the shift - let x0 = x0 + Wrapping(65536 + (128 << 17)); - let x1 = x1 + Wrapping(65536 + (128 << 17)); - let x2 = x2 + Wrapping(65536 + (128 << 17)); - let x3 = x3 + Wrapping(65536 + (128 << 17)); - - output[i * output_linestride] = stbi_clamp((x0 + t3) >> 17); - output[i * output_linestride + 7] = stbi_clamp((x0 - t3) >> 17); - output[i * output_linestride + 1] = stbi_clamp((x1 + t2) >> 17); - output[i * output_linestride + 6] = stbi_clamp((x1 - t2) >> 17); - output[i * output_linestride + 2] = stbi_clamp((x2 + t1) >> 17); - output[i * output_linestride + 5] = stbi_clamp((x2 - t1) >> 17); - output[i * output_linestride + 3] = stbi_clamp((x3 + t0) >> 17); - output[i * output_linestride + 4] = stbi_clamp((x3 - t0) >> 17); - } +#[inline(always)] +fn idct_1d_x(s: &[i32x8; 8], correction: i32) -> [i32x8; 4] { + let p2 = s[2]; + let p3 = s[6]; + let p1 = (p2 + p3) * i32x8::splat(stbi_f2f(0.5411961)); + let t2 = p1 + p3 * i32x8::splat(stbi_f2f(-1.847759065)); + let t3 = p1 + p2 * i32x8::splat(stbi_f2f(0.765366865)); + let p2 = s[0]; + let p3 = s[4]; + let t0 = stbi_fsh_simd(p2 + p3); + let t1 = stbi_fsh_simd(p2 - p3); + let correction = i32x8::splat(correction); + [ + t0 + t3 + correction, + t1 + t2 + correction, + t1 - t2 + correction, + t0 - t3 + correction, + ] +} + + +#[inline(always)] +fn idct_1d_t(s: &[i32x8; 8]) -> [i32x8; 4] { + let t0 = s[7]; + let t1 = s[5]; + let t2 = s[3]; + let t3 = s[1]; + let p3 = t0 + t2; + let p4 = t1 + t3; + let p1 = t0 + t3; + let p2 = t1 + t2; + let p5 = (p3 + p4) * i32x8::splat(stbi_f2f(1.175875602)); + let t0 = t0 * i32x8::splat(stbi_f2f(0.298631336)); + let t1 = t1 * i32x8::splat(stbi_f2f(2.053119869)); + let t2 = t2 * i32x8::splat(stbi_f2f(3.072711026)); + let t3 = t3 * i32x8::splat(stbi_f2f(1.501321110)); + let p1 = p5 + (p1 * i32x8::splat(stbi_f2f(-0.899976223))); + let p2 = p5 + (p2 * i32x8::splat(stbi_f2f(-2.562915447))); + let p3 = p3 * i32x8::splat(stbi_f2f(-1.961570560)); + let p4 = p4 * i32x8::splat(stbi_f2f(-0.390180644)); + [t0 + p1 + p3, t1 + p2 + p4, t2 + p2 + p3, t3 + p1 + p4, ] } // 4x4 and 2x2 IDCT based on Rakesh Dugad and Narendra Ahuja: "A Fast Scheme for Image Size Change in the Compressed Domain" (2001). // http://sylvana.net/jpegcrop/jidctred/ fn dequantize_and_idct_block_4x4(coefficients: &[i16], quantization_table: &[u16; 64], output_linestride: usize, output: &mut [u8]) { debug_assert_eq!(coefficients.len(), 64); - let mut temp = [Wrapping(0i32); 4 * 4]; - const CONST_BITS: usize = 12; - const PASS1_BITS: usize = 2; - const FINAL_BITS: usize = CONST_BITS + PASS1_BITS + 3; + const CONST_BITS: u32 = 12; + const PASS1_BITS: u32 = 2; + const FINAL_BITS: u32 = CONST_BITS + PASS1_BITS + 3; + + let coeff_vectors = [ + i32x4::from(i16x4::from_slice_unaligned(&coefficients[0..0 + 4])), + i32x4::from(i16x4::from_slice_unaligned(&coefficients[8..8 + 4])), + i32x4::from(i16x4::from_slice_unaligned(&coefficients[16..16 + 4])), + i32x4::from(i16x4::from_slice_unaligned(&coefficients[24..24 + 4])), + ]; + + let quant_vectors = [ + i32x4::from(u16x4::from_slice_unaligned(&quantization_table[0..0 + 4])), + i32x4::from(u16x4::from_slice_unaligned(&quantization_table[8..8 + 4])), + i32x4::from(u16x4::from_slice_unaligned(&quantization_table[16..16 + 4])), + i32x4::from(u16x4::from_slice_unaligned(&quantization_table[24..24 + 4])), + ]; + + let mut s: [i32x4; 4] = [ + coeff_vectors[0] * quant_vectors[0], + coeff_vectors[1] * quant_vectors[1], + coeff_vectors[2] * quant_vectors[2], + coeff_vectors[3] * quant_vectors[3], + ]; + + let x0 = (s[0] + s[2]) << PASS1_BITS; + let x2 = (s[0] - s[2]) << PASS1_BITS; + let p1 = (s[1] + s[3]) * i32x4::splat(stbi_f2f(0.541196100)); + let t0 = (p1 + + s[3] * i32x4::splat(stbi_f2f(-1.847759065)) + + i32x4::splat(512) + ) >> (CONST_BITS - PASS1_BITS); + let t2 = (p1 + + s[1] * i32x4::splat(stbi_f2f(0.765366865)) + + i32x4::splat(512) + ) >> (CONST_BITS - PASS1_BITS); + + s[0] = x0 + t2; + s[1] = x2 + t0; + s[2] = x2 - t0; + s[3] = x0 - t2; // columns - for i in 0..4 { - let s0 = Wrapping(coefficients[i + 8 * 0] as i32 * quantization_table[i + 8 * 0] as i32); - let s1 = Wrapping(coefficients[i + 8 * 1] as i32 * quantization_table[i + 8 * 1] as i32); - let s2 = Wrapping(coefficients[i + 8 * 2] as i32 * quantization_table[i + 8 * 2] as i32); - let s3 = Wrapping(coefficients[i + 8 * 3] as i32 * quantization_table[i + 8 * 3] as i32); - - let x0 = (s0 + s2) << PASS1_BITS; - let x2 = (s0 - s2) << PASS1_BITS; - - let p1 = (s1 + s3) * stbi_f2f(0.541196100); - let t0 = (p1 + s3 * stbi_f2f(-1.847759065) + Wrapping(512)) >> (CONST_BITS - PASS1_BITS); - let t2 = (p1 + s1 * stbi_f2f(0.765366865) + Wrapping(512)) >> (CONST_BITS - PASS1_BITS); - - temp[i + 4 * 0] = x0 + t2; - temp[i + 4 * 3] = x0 - t2; - temp[i + 4 * 1] = x2 + t0; - temp[i + 4 * 2] = x2 - t0; - } + simd_transpose!(i32x4, s); - for i in 0 .. 4 { - let s0 = temp[i * 4 + 0]; - let s1 = temp[i * 4 + 1]; - let s2 = temp[i * 4 + 2]; - let s3 = temp[i * 4 + 3]; - - let x0 = (s0 + s2) << CONST_BITS; - let x2 = (s0 - s2) << CONST_BITS; - - let p1 = (s1 + s3) * stbi_f2f(0.541196100); - let t0 = p1 + s3 * stbi_f2f(-1.847759065); - let t2 = p1 + s1 * stbi_f2f(0.765366865); - - // constants scaled things up by 1<<12, plus we had 1<<2 from first - // loop, plus horizontal and vertical each scale by sqrt(8) so together - // we've got an extra 1<<3, so 1<<17 total we need to remove. - // so we want to round that, which means adding 0.5 * 1<<17, - // aka 65536. Also, we'll end up with -128 to 127 that we want - // to encode as 0..255 by adding 128, so we'll add that before the shift - let x0 = x0 + Wrapping(1 << (FINAL_BITS - 1)) + Wrapping(128 << FINAL_BITS); - let x2 = x2 + Wrapping(1 << (FINAL_BITS - 1)) + Wrapping(128 << FINAL_BITS); - - output[i * output_linestride + 0] = stbi_clamp((x0 + t2) >> FINAL_BITS); - output[i * output_linestride + 3] = stbi_clamp((x0 - t2) >> FINAL_BITS); - output[i * output_linestride + 1] = stbi_clamp((x2 + t0) >> FINAL_BITS); - output[i * output_linestride + 2] = stbi_clamp((x2 - t0) >> FINAL_BITS); + let x0 = (s[0] + s[2]) << CONST_BITS; + let x2 = (s[0] - s[2]) << CONST_BITS; + + let p1 = (s[1] + s[3]) * i32x4::splat(stbi_f2f(0.541196100)); + let t0 = p1 + s[3] * i32x4::splat(stbi_f2f(-1.847759065)); + let t2 = p1 + s[1] * i32x4::splat(stbi_f2f(0.765366865)); + + let x0 = x0 + i32x4::splat(1 << (FINAL_BITS - 1)) + i32x4::splat(128 << FINAL_BITS); + let x2 = x2 + i32x4::splat(1 << (FINAL_BITS - 1)) + i32x4::splat(128 << FINAL_BITS); + + let mut results = [ + stbi_clamp_simd!(i32x4,u8x4, (x0 + t2) >> FINAL_BITS), + stbi_clamp_simd!(i32x4,u8x4, (x2 + t0) >> FINAL_BITS), + stbi_clamp_simd!(i32x4,u8x4, (x2 - t0) >> FINAL_BITS), + stbi_clamp_simd!(i32x4,u8x4, (x0 - t2) >> FINAL_BITS), + ]; + + simd_transpose!(u8x4, results); + + for i in 0..4 { + let n = i * output_linestride; + results[i].write_to_slice_unaligned(&mut output[n..n + 4]); } } @@ -301,11 +352,11 @@ fn stbi_clamp(x: Wrapping<i32>) -> u8 x.0.max(0).min(255) as u8 } -fn stbi_f2f(x: f32) -> Wrapping<i32> { - Wrapping((x * 4096.0 + 0.5) as i32) +fn stbi_f2f(x: f32) -> i32 { + (x * 4096.0 + 0.5) as i32 } -fn stbi_fsh(x: Wrapping<i32>) -> Wrapping<i32> { +fn stbi_fsh_simd(x: i32x8) -> i32x8 { x << 12 } @@ -379,3 +430,4 @@ fn test_dequantize_and_idct_block_8x8_saturated() { 255, 255, 0, 255, 0, 255, 0, 0]; assert_eq!(&output[..], &expected[..]); } +