diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index dca7c9fe..d2a51c46 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -16,6 +16,13 @@ jobs:
         rust: ["1.34.2", stable, beta, nightly]
         features: ["", "rayon"]
         command: [test, benchmark]
+        include:
+          - rust: nightly
+            features: packed_simd
+            command: test
+          - rust: nightly
+            features: packed_simd
+            command: benchmark
     steps:
     - uses: actions/checkout@v2
     - run: rustup default ${{ matrix.rust }}
@@ -24,10 +31,12 @@ jobs:
         cargo build --verbose --no-default-features --features "$FEATURES" &&
         cargo test --tests --benches --no-default-features --features "$FEATURES"
       if: ${{ matrix.command == 'test' }}
+      continue-on-error: ${{ matrix.rust == 'nightly' }}
       env:
         FEATURES: ${{ matrix.features }}
     - name: benchmark
       run: cargo bench --bench decoding_benchmark --no-default-features --features "$FEATURES" -- --warm-up-time 1 --measurement-time 1 --sample-size 25
       if: ${{ matrix.command == 'benchmark' }}
+      continue-on-error: ${{ matrix.rust == 'nightly' }}
       env:
         FEATURES: ${{ matrix.features }}
\ No newline at end of file
diff --git a/Cargo.toml b/Cargo.toml
index 7687eb9a..36887120 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,6 +13,8 @@ exclude = ["tests/*"]
 [dependencies]
 byteorder = "1.0"
 rayon = { version = "1.0", optional = true }
+simulated_packed_simd = "0.0.1"
+packed_simd = { version = "0.3", optional = true }
 
 [dev-dependencies]
 png = "0.16"
diff --git a/README.md b/README.md
index 33524ae3..0da88531 100644
--- a/README.md
+++ b/README.md
@@ -31,5 +31,14 @@ fn main() {
 }
 ```
 
+## Performance
+This crate uses [rayon](https://github.com/rayon-rs/rayon) to decode images on all available cores by default.
+This can be disabled by declaring the dependency with `default-features = false`.
+
+This crate can optionally use [SIMD](https://en.wikipedia.org/wiki/SIMD) instructions
+to decode images even faster.
+This is not enabled by default because it requires a nightly compiler,
+but it can be activated with the `packed_simd` feature.
+
 ## Requirements
 This crate compiles only with rust >= 1.34.
diff --git a/src/idct.rs b/src/idct.rs
index a5de0828..0f7242fa 100644
--- a/src/idct.rs
+++ b/src/idct.rs
@@ -4,6 +4,24 @@
 use crate::parser::Dimensions;
 use std::num::Wrapping;
 
+#[cfg(feature = "packed_simd")]
+extern crate packed_simd;
+
+#[cfg(feature = "packed_simd")]
+use self::packed_simd as simd;
+
+#[cfg(not(feature = "packed_simd"))]
+extern crate simulated_packed_simd;
+
+#[cfg(not(feature = "packed_simd"))]
+use self::simulated_packed_simd as simd;
+
+use self::simd::{
+    i32x8, i16x8, u16x8, u8x8,
+    i32x4, i16x4, u16x4, u8x4,
+    FromCast,
+};
+
 pub(crate) fn choose_idct_size(full_size: Dimensions, requested_size: Dimensions) -> usize {
     fn scaled(len: u16, scale: usize) -> u16 { ((len as u32 * scale as u32 - 1) / 8 + 1) as u16 }
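As a side note on the unchanged context above, `scaled` computes `ceil(len * scale / 8)` in integer arithmetic; a quick check of the formula:

```rust
// scaled(100, 4): a 100-pixel dimension at 4/8 scale.
// (100 * 4 - 1) / 8 + 1 == 399 / 8 + 1 == 49 + 1 == 50 == ceil(400 / 8)
assert_eq!((100u32 * 4 - 1) / 8 + 1, 50);
```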
 
@@ -45,215 +63,248 @@ pub(crate) fn dequantize_and_idct_block(scale: usize, coefficients: &[i16], quan
     }
 }
 
+
+/// Transpose a square matrix (4x4 or 8x8) represented as an array of SIMD vectors.
+/// This has to be a macro rather than a function so that it does not depend on const generics.
+macro_rules! simd_transpose {
+    (i32x8, $s: expr) => {simd_transpose!(8, i32x8, $s)};
+    (u8x8, $s: expr) => {simd_transpose!(8, u8x8, $s)};
+    (i32x4, $s: expr) => {simd_transpose!(4, i32x4, $s)};
+    (u8x4, $s: expr) => {simd_transpose!(4, u8x4, $s)};
+    (4, $t:tt, $s: expr) => {
+        simd_transpose!([
+            0, 0,1,2,3;
+            1, 0,1,2,3;
+            2, 0,1,2,3;
+            3, 0,1,2,3
+        ], $t, $s)
+    };
+    (8, $t:tt, $s: expr) => {
+        simd_transpose!([
+            0, 0,1,2,3,4,5,6,7;
+            1, 0,1,2,3,4,5,6,7;
+            2, 0,1,2,3,4,5,6,7;
+            3, 0,1,2,3,4,5,6,7;
+            4, 0,1,2,3,4,5,6,7;
+            5, 0,1,2,3,4,5,6,7;
+            6, 0,1,2,3,4,5,6,7;
+            7, 0,1,2,3,4,5,6,7
+        ], $t, $s)
+    };
+    ([  $( $i:expr, $($j:expr),* );*  ], $t: tt, $s: expr) => {
+        $s = [
+            $(
+                $t::new( $( $s[$j].extract($i) ),* )
+            ),*
+        ];
+    }
+}
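For readers expanding the macro by hand: every arm ultimately hits the last rule, which rebuilds the array so that lane `i` of each input vector becomes vector `i`, i.e. `new[i][j] == old[j][i]`. For the 4x4 case the generated code is roughly:

```rust
// Approximate expansion of simd_transpose!(i32x4, s). The right-hand side
// is built entirely from the old `s` before the assignment takes effect.
s = [
    i32x4::new(s[0].extract(0), s[1].extract(0), s[2].extract(0), s[3].extract(0)),
    i32x4::new(s[0].extract(1), s[1].extract(1), s[2].extract(1), s[3].extract(1)),
    i32x4::new(s[0].extract(2), s[1].extract(2), s[2].extract(2), s[3].extract(2)),
    i32x4::new(s[0].extract(3), s[1].extract(3), s[2].extract(3), s[3].extract(3)),
];
```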
+
+/// Clamp each lane of a vector to the 0..=255 range and convert it to unsigned 8-bit lanes
+macro_rules! stbi_clamp_simd {
+    ($source:tt, $target:tt, $x:expr) => (
+        $target::from_cast(
+            $x.max($source::splat(0))
+              .min($source::splat(255))))
+}
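In expanded form the clamp is a saturating narrowing conversion: each lane is clamped in 32-bit arithmetic and then cast down lane-wise. A sketch of what `stbi_clamp_simd!(i32x8, u8x8, v)` produces:

```rust
// Roughly the expansion of stbi_clamp_simd!(i32x8, u8x8, v).
fn clamp_to_u8(v: i32x8) -> u8x8 {
    u8x8::from_cast(v.max(i32x8::splat(0)).min(i32x8::splat(255)))
}
```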
+
 // This is based on stb_image's 'stbi__idct_block'.
 fn dequantize_and_idct_block_8x8(coefficients: &[i16], quantization_table: &[u16; 64], output_linestride: usize, output: &mut [u8]) {
     debug_assert_eq!(coefficients.len(), 64);
 
-    let mut temp = [Wrapping(0i32); 64];
+    let coeff_vectors = [
+        i32x8::from(i16x8::from_slice_unaligned(&coefficients[0..0 + 8])),
+        i32x8::from(i16x8::from_slice_unaligned(&coefficients[8..8 + 8])),
+        i32x8::from(i16x8::from_slice_unaligned(&coefficients[16..16 + 8])),
+        i32x8::from(i16x8::from_slice_unaligned(&coefficients[24..24 + 8])),
+        i32x8::from(i16x8::from_slice_unaligned(&coefficients[32..32 + 8])),
+        i32x8::from(i16x8::from_slice_unaligned(&coefficients[40..40 + 8])),
+        i32x8::from(i16x8::from_slice_unaligned(&coefficients[48..48 + 8])),
+        i32x8::from(i16x8::from_slice_unaligned(&coefficients[56..56 + 8])),
+    ];
+
+    let quant_vectors = [
+        i32x8::from(u16x8::from_slice_unaligned(&quantization_table[0..0 + 8])),
+        i32x8::from(u16x8::from_slice_unaligned(&quantization_table[8..8 + 8])),
+        i32x8::from(u16x8::from_slice_unaligned(&quantization_table[16..16 + 8])),
+        i32x8::from(u16x8::from_slice_unaligned(&quantization_table[24..24 + 8])),
+        i32x8::from(u16x8::from_slice_unaligned(&quantization_table[32..32 + 8])),
+        i32x8::from(u16x8::from_slice_unaligned(&quantization_table[40..40 + 8])),
+        i32x8::from(u16x8::from_slice_unaligned(&quantization_table[48..48 + 8])),
+        i32x8::from(u16x8::from_slice_unaligned(&quantization_table[56..56 + 8])),
+    ];
+
+    let mut s: [i32x8; 8] = [
+        coeff_vectors[0] * quant_vectors[0],
+        coeff_vectors[1] * quant_vectors[1],
+        coeff_vectors[2] * quant_vectors[2],
+        coeff_vectors[3] * quant_vectors[3],
+        coeff_vectors[4] * quant_vectors[4],
+        coeff_vectors[5] * quant_vectors[5],
+        coeff_vectors[6] * quant_vectors[6],
+        coeff_vectors[7] * quant_vectors[7],
+    ];
+
+    // constants scaled things up by 1<<12; let's bring them back
+    // down, but keep 2 extra bits of precision
+    let x = idct_1d_x(&s, 512);
+    let t = idct_1d_t(&s);
+
+    s[0] = (x[0] + t[3]) >> 10;
+    s[1] = (x[1] + t[2]) >> 10;
+    s[2] = (x[2] + t[1]) >> 10;
+    s[3] = (x[3] + t[0]) >> 10;
+    s[4] = (x[3] - t[0]) >> 10;
+    s[5] = (x[2] - t[1]) >> 10;
+    s[6] = (x[1] - t[2]) >> 10;
+    s[7] = (x[0] - t[3]) >> 10;
 
     // columns
-    for i in 0 .. 8 {
-        // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
-        if coefficients[i + 8] == 0 && coefficients[i + 16] == 0 && coefficients[i + 24] == 0 &&
-                coefficients[i + 32] == 0 && coefficients[i + 40] == 0 && coefficients[i + 48] == 0 &&
-                coefficients[i + 56] == 0 {
-            let dcterm = Wrapping(coefficients[i] as i32 * quantization_table[i] as i32) << 2;
-            temp[i]      = dcterm;
-            temp[i + 8]  = dcterm;
-            temp[i + 16] = dcterm;
-            temp[i + 24] = dcterm;
-            temp[i + 32] = dcterm;
-            temp[i + 40] = dcterm;
-            temp[i + 48] = dcterm;
-            temp[i + 56] = dcterm;
-        }
-        else {
-            let s0 = Wrapping(coefficients[i] as i32 * quantization_table[i] as i32);
-            let s1 = Wrapping(coefficients[i + 8] as i32 * quantization_table[i + 8] as i32);
-            let s2 = Wrapping(coefficients[i + 16] as i32 * quantization_table[i + 16] as i32);
-            let s3 = Wrapping(coefficients[i + 24] as i32 * quantization_table[i + 24] as i32);
-            let s4 = Wrapping(coefficients[i + 32] as i32 * quantization_table[i + 32] as i32);
-            let s5 = Wrapping(coefficients[i + 40] as i32 * quantization_table[i + 40] as i32);
-            let s6 = Wrapping(coefficients[i + 48] as i32 * quantization_table[i + 48] as i32);
-            let s7 = Wrapping(coefficients[i + 56] as i32 * quantization_table[i + 56] as i32);
-
-            let p2 = s2;
-            let p3 = s6;
-            let p1 = (p2 + p3) * stbi_f2f(0.5411961);
-            let t2 = p1 + p3 * stbi_f2f(-1.847759065);
-            let t3 = p1 + p2 * stbi_f2f(0.765366865);
-            let p2 = s0;
-            let p3 = s4;
-            let t0 = stbi_fsh(p2 + p3);
-            let t1 = stbi_fsh(p2 - p3);
-            let x0 = t0 + t3;
-            let x3 = t0 - t3;
-            let x1 = t1 + t2;
-            let x2 = t1 - t2;
-            let t0 = s7;
-            let t1 = s5;
-            let t2 = s3;
-            let t3 = s1;
-            let p3 = t0 + t2;
-            let p4 = t1 + t3;
-            let p1 = t0 + t3;
-            let p2 = t1 + t2;
-            let p5 = (p3 + p4) * stbi_f2f(1.175875602);
-            let t0 = t0 * stbi_f2f(0.298631336);
-            let t1 = t1 * stbi_f2f(2.053119869);
-            let t2 = t2 * stbi_f2f(3.072711026);
-            let t3 = t3 * stbi_f2f(1.501321110);
-            let p1 = p5 + (p1 * stbi_f2f(-0.899976223));
-            let p2 = p5 + (p2 * stbi_f2f(-2.562915447));
-            let p3 = p3 * stbi_f2f(-1.961570560);
-            let p4 = p4 * stbi_f2f(-0.390180644);
-            let t3 = t3 + p1 + p4;
-            let t2 = t2 + p2 + p3;
-            let t1 = t1 + p2 + p4;
-            let t0 = t0 + p1 + p3;
-
-            // constants scaled things up by 1<<12; let's bring them back
-            // down, but keep 2 extra bits of precision
-            let x0 = x0 + Wrapping(512);
-            let x1 = x1 + Wrapping(512);
-            let x2 = x2 + Wrapping(512);
-            let x3 = x3 + Wrapping(512);
-
-            temp[i] = (x0 + t3) >> 10;
-            temp[i + 56] = (x0 - t3) >> 10;
-            temp[i + 8] = (x1 + t2) >> 10;
-            temp[i + 48] = (x1 - t2) >> 10;
-            temp[i + 16] = (x2 + t1) >> 10;
-            temp[i + 40] = (x2 - t1) >> 10;
-            temp[i + 24] = (x3 + t0) >> 10;
-            temp[i + 32] = (x3 - t0) >> 10;
-        }
+    simd_transpose!(i32x8, s);
+
+    // constants scaled things up by 1<<12, plus we had 1<<2 from first
+    // loop, plus horizontal and vertical each scale by sqrt(8) so together
+    // we've got an extra 1<<3, so 1<<17 total we need to remove.
+    // so we want to round that, which means adding 0.5 * 1<<17,
+    // aka 65536. Also, we'll end up with -128 to 127 that we want
+    // to encode as 0..255 by adding 128, so we'll add that before the shift
+    let x = idct_1d_x(&s, 65536 + (128 << 17));
+    let t = idct_1d_t(&s);
+
+    let mut results = [
+        stbi_clamp_simd!(i32x8, u8x8, (x[0] + t[3]) >> 17),
+        stbi_clamp_simd!(i32x8, u8x8, (x[1] + t[2]) >> 17),
+        stbi_clamp_simd!(i32x8, u8x8, (x[2] + t[1]) >> 17),
+        stbi_clamp_simd!(i32x8, u8x8, (x[3] + t[0]) >> 17),
+        stbi_clamp_simd!(i32x8, u8x8, (x[3] - t[0]) >> 17),
+        stbi_clamp_simd!(i32x8, u8x8, (x[2] - t[1]) >> 17),
+        stbi_clamp_simd!(i32x8, u8x8, (x[1] - t[2]) >> 17),
+        stbi_clamp_simd!(i32x8, u8x8, (x[0] - t[3]) >> 17),
+    ];
+
+    simd_transpose!(u8x8, results);
+
+    for i in 0..8 {
+        let n = i * output_linestride;
+        results[i].write_to_slice_unaligned(&mut output[n..n + 8]);
     }
+}
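To check the rounding constant above on a single lane, here is the same arithmetic in scalar form:

```rust
// One lane of the final shift: a sample of -3 (in Q17) gets half a unit
// of rounding (65536) plus the +128 bias before the shift.
let v: i32 = (-3) << 17;
let biased = v + 65536 + (128 << 17);
assert_eq!(biased >> 17, 125); // -3 + 128 == 125
```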
 
-    for i in 0 .. 8 {
-        // no fast case since the first 1D IDCT spread components out
-        let s0 = temp[i * 8];
-        let s1 = temp[i * 8 + 1];
-        let s2 = temp[i * 8 + 2];
-        let s3 = temp[i * 8 + 3];
-        let s4 = temp[i * 8 + 4];
-        let s5 = temp[i * 8 + 5];
-        let s6 = temp[i * 8 + 6];
-        let s7 = temp[i * 8 + 7];
-
-        let p2 = s2;
-        let p3 = s6;
-        let p1 = (p2 + p3) * stbi_f2f(0.5411961);
-        let t2 = p1 + p3 * stbi_f2f(-1.847759065);
-        let t3 = p1 + p2 * stbi_f2f(0.765366865);
-        let p2 = s0;
-        let p3 = s4;
-        let t0 = stbi_fsh(p2 + p3);
-        let t1 = stbi_fsh(p2 - p3);
-        let x0 = t0 + t3;
-        let x3 = t0 - t3;
-        let x1 = t1 + t2;
-        let x2 = t1 - t2;
-        let t0 = s7;
-        let t1 = s5;
-        let t2 = s3;
-        let t3 = s1;
-        let p3 = t0 + t2;
-        let p4 = t1 + t3;
-        let p1 = t0 + t3;
-        let p2 = t1 + t2;
-        let p5 = (p3 + p4) * stbi_f2f(1.175875602);
-        let t0 = t0 * stbi_f2f(0.298631336);
-        let t1 = t1 * stbi_f2f(2.053119869);
-        let t2 = t2 * stbi_f2f(3.072711026);
-        let t3 = t3 * stbi_f2f(1.501321110);
-        let p1 = p5 + p1 * stbi_f2f(-0.899976223);
-        let p2 = p5 + p2 * stbi_f2f(-2.562915447);
-        let p3 = p3 * stbi_f2f(-1.961570560);
-        let p4 = p4 * stbi_f2f(-0.390180644);
-        let t3 = t3 + p1 + p4;
-        let t2 = t2 + p2 + p3;
-        let t1 = t1 + p2 + p4;
-        let t0 = t0 + p1 + p3;
-
-        // constants scaled things up by 1<<12, plus we had 1<<2 from first
-        // loop, plus horizontal and vertical each scale by sqrt(8) so together
-        // we've got an extra 1<<3, so 1<<17 total we need to remove.
-        // so we want to round that, which means adding 0.5 * 1<<17,
-        // aka 65536. Also, we'll end up with -128 to 127 that we want
-        // to encode as 0..255 by adding 128, so we'll add that before the shift
-        let x0 = x0 + Wrapping(65536 + (128 << 17));
-        let x1 = x1 + Wrapping(65536 + (128 << 17));
-        let x2 = x2 + Wrapping(65536 + (128 << 17));
-        let x3 = x3 + Wrapping(65536 + (128 << 17));
-
-        output[i * output_linestride] = stbi_clamp((x0 + t3) >> 17);
-        output[i * output_linestride + 7] = stbi_clamp((x0 - t3) >> 17);
-        output[i * output_linestride + 1] = stbi_clamp((x1 + t2) >> 17);
-        output[i * output_linestride + 6] = stbi_clamp((x1 - t2) >> 17);
-        output[i * output_linestride + 2] = stbi_clamp((x2 + t1) >> 17);
-        output[i * output_linestride + 5] = stbi_clamp((x2 - t1) >> 17);
-        output[i * output_linestride + 3] = stbi_clamp((x3 + t0) >> 17);
-        output[i * output_linestride + 4] = stbi_clamp((x3 - t0) >> 17);
-    }
+#[inline(always)]
+fn idct_1d_x(s: &[i32x8; 8], correction: i32) -> [i32x8; 4] {
+    let p2 = s[2];
+    let p3 = s[6];
+    let p1 = (p2 + p3) * i32x8::splat(stbi_f2f(0.5411961));
+    let t2 = p1 + p3 * i32x8::splat(stbi_f2f(-1.847759065));
+    let t3 = p1 + p2 * i32x8::splat(stbi_f2f(0.765366865));
+    let p2 = s[0];
+    let p3 = s[4];
+    let t0 = stbi_fsh_simd(p2 + p3);
+    let t1 = stbi_fsh_simd(p2 - p3);
+    let correction = i32x8::splat(correction);
+    [
+        t0 + t3 + correction,
+        t1 + t2 + correction,
+        t1 - t2 + correction,
+        t0 - t3 + correction,
+    ]
+}
+
+
+#[inline(always)]
+fn idct_1d_t(s: &[i32x8; 8]) -> [i32x8; 4] {
+    let t0 = s[7];
+    let t1 = s[5];
+    let t2 = s[3];
+    let t3 = s[1];
+    let p3 = t0 + t2;
+    let p4 = t1 + t3;
+    let p1 = t0 + t3;
+    let p2 = t1 + t2;
+    let p5 = (p3 + p4) * i32x8::splat(stbi_f2f(1.175875602));
+    let t0 = t0 * i32x8::splat(stbi_f2f(0.298631336));
+    let t1 = t1 * i32x8::splat(stbi_f2f(2.053119869));
+    let t2 = t2 * i32x8::splat(stbi_f2f(3.072711026));
+    let t3 = t3 * i32x8::splat(stbi_f2f(1.501321110));
+    let p1 = p5 + (p1 * i32x8::splat(stbi_f2f(-0.899976223)));
+    let p2 = p5 + (p2 * i32x8::splat(stbi_f2f(-2.562915447)));
+    let p3 = p3 * i32x8::splat(stbi_f2f(-1.961570560));
+    let p4 = p4 * i32x8::splat(stbi_f2f(-0.390180644));
+    [t0 + p1 + p3, t1 + p2 + p4, t2 + p2 + p3, t3 + p1 + p4]
 }
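Both helpers lean on `stbi_f2f` (changed below to return a plain `i32`) for Q12 fixed-point constants: multiplying and shifting right by 12 approximates a floating-point multiply. A worked example:

```rust
// Q12 fixed point in two lines: the constant becomes an integer with
// 12 fractional bits, and a shift undoes the scale after multiplying.
assert_eq!(stbi_f2f(0.5411961), 2217);                 // 0.5411961 * 4096, rounded
assert_eq!((100 * stbi_f2f(0.5411961)) >> 12, 54);     // ~ 100 * 0.5411961
```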
 
 // 4x4 and 2x2 IDCT based on Rakesh Dugad and Narendra Ahuja: "A Fast Scheme for Image Size Change in the Compressed Domain" (2001).
 // http://sylvana.net/jpegcrop/jidctred/
 fn dequantize_and_idct_block_4x4(coefficients: &[i16], quantization_table: &[u16; 64], output_linestride: usize, output: &mut [u8]) {
     debug_assert_eq!(coefficients.len(), 64);
-    let mut temp = [Wrapping(0i32); 4 * 4];
 
-    const CONST_BITS: usize = 12;
-    const PASS1_BITS: usize = 2;
-    const FINAL_BITS: usize = CONST_BITS + PASS1_BITS + 3;
+    const CONST_BITS: u32 = 12;
+    const PASS1_BITS: u32 = 2;
+    const FINAL_BITS: u32 = CONST_BITS + PASS1_BITS + 3;
+
+    let coeff_vectors = [
+        i32x4::from(i16x4::from_slice_unaligned(&coefficients[0..0 + 4])),
+        i32x4::from(i16x4::from_slice_unaligned(&coefficients[8..8 + 4])),
+        i32x4::from(i16x4::from_slice_unaligned(&coefficients[16..16 + 4])),
+        i32x4::from(i16x4::from_slice_unaligned(&coefficients[24..24 + 4])),
+    ];
+
+    let quant_vectors = [
+        i32x4::from(u16x4::from_slice_unaligned(&quantization_table[0..0 + 4])),
+        i32x4::from(u16x4::from_slice_unaligned(&quantization_table[8..8 + 4])),
+        i32x4::from(u16x4::from_slice_unaligned(&quantization_table[16..16 + 4])),
+        i32x4::from(u16x4::from_slice_unaligned(&quantization_table[24..24 + 4])),
+    ];
+
+    let mut s: [i32x4; 4] = [
+        coeff_vectors[0] * quant_vectors[0],
+        coeff_vectors[1] * quant_vectors[1],
+        coeff_vectors[2] * quant_vectors[2],
+        coeff_vectors[3] * quant_vectors[3],
+    ];
+
+    let x0 = (s[0] + s[2]) << PASS1_BITS;
+    let x2 = (s[0] - s[2]) << PASS1_BITS;
+    let p1 = (s[1] + s[3]) * i32x4::splat(stbi_f2f(0.541196100));
+    let t0 = (p1
+        + s[3] * i32x4::splat(stbi_f2f(-1.847759065))
+        + i32x4::splat(512)
+    ) >> (CONST_BITS - PASS1_BITS);
+    let t2 = (p1
+        + s[1] * i32x4::splat(stbi_f2f(0.765366865))
+        + i32x4::splat(512)
+    ) >> (CONST_BITS - PASS1_BITS);
+
+    s[0] = x0 + t2;
+    s[1] = x2 + t0;
+    s[2] = x2 - t0;
+    s[3] = x0 - t2;
 
     // columns
-    for i in 0..4 {
-        let s0 = Wrapping(coefficients[i + 8 * 0] as i32 * quantization_table[i + 8 * 0] as i32);
-        let s1 = Wrapping(coefficients[i + 8 * 1] as i32 * quantization_table[i + 8 * 1] as i32);
-        let s2 = Wrapping(coefficients[i + 8 * 2] as i32 * quantization_table[i + 8 * 2] as i32);
-        let s3 = Wrapping(coefficients[i + 8 * 3] as i32 * quantization_table[i + 8 * 3] as i32);
-
-        let x0 = (s0 + s2) << PASS1_BITS;
-        let x2 = (s0 - s2) << PASS1_BITS;
-
-        let p1 = (s1 + s3) * stbi_f2f(0.541196100);
-        let t0 = (p1 + s3 * stbi_f2f(-1.847759065) + Wrapping(512)) >> (CONST_BITS - PASS1_BITS);
-        let t2 = (p1 + s1 * stbi_f2f(0.765366865) + Wrapping(512)) >> (CONST_BITS - PASS1_BITS);
-
-        temp[i + 4 * 0] = x0 + t2;
-        temp[i + 4 * 3] = x0 - t2;
-        temp[i + 4 * 1] = x2 + t0;
-        temp[i + 4 * 2] = x2 - t0;
-    }
+    simd_transpose!(i32x4, s);
 
-    for i in 0 .. 4 {
-        let s0 = temp[i * 4 + 0];
-        let s1 = temp[i * 4 + 1];
-        let s2 = temp[i * 4 + 2];
-        let s3 = temp[i * 4 + 3];
-
-        let x0 = (s0 + s2) << CONST_BITS;
-        let x2 = (s0 - s2) << CONST_BITS;
-
-        let p1 = (s1 + s3) * stbi_f2f(0.541196100);
-        let t0 = p1 + s3 * stbi_f2f(-1.847759065);
-        let t2 = p1 + s1 * stbi_f2f(0.765366865);
-
-        // constants scaled things up by 1<<12, plus we had 1<<2 from first
-        // loop, plus horizontal and vertical each scale by sqrt(8) so together
-        // we've got an extra 1<<3, so 1<<17 total we need to remove.
-        // so we want to round that, which means adding 0.5 * 1<<17,
-        // aka 65536. Also, we'll end up with -128 to 127 that we want
-        // to encode as 0..255 by adding 128, so we'll add that before the shift
-        let x0 = x0 + Wrapping(1 << (FINAL_BITS - 1)) + Wrapping(128 << FINAL_BITS);
-        let x2 = x2 + Wrapping(1 << (FINAL_BITS - 1)) + Wrapping(128 << FINAL_BITS);
-
-        output[i * output_linestride + 0] = stbi_clamp((x0 + t2) >> FINAL_BITS);
-        output[i * output_linestride + 3] = stbi_clamp((x0 - t2) >> FINAL_BITS);
-        output[i * output_linestride + 1] = stbi_clamp((x2 + t0) >> FINAL_BITS);
-        output[i * output_linestride + 2] = stbi_clamp((x2 - t0) >> FINAL_BITS);
+    let x0 = (s[0] + s[2]) << CONST_BITS;
+    let x2 = (s[0] - s[2]) << CONST_BITS;
+
+    let p1 = (s[1] + s[3]) * i32x4::splat(stbi_f2f(0.541196100));
+    let t0 = p1 + s[3] * i32x4::splat(stbi_f2f(-1.847759065));
+    let t2 = p1 + s[1] * i32x4::splat(stbi_f2f(0.765366865));
+
+    let x0 = x0 + i32x4::splat(1 << (FINAL_BITS - 1)) + i32x4::splat(128 << FINAL_BITS);
+    let x2 = x2 + i32x4::splat(1 << (FINAL_BITS - 1)) + i32x4::splat(128 << FINAL_BITS);
+
+    let mut results = [
+        stbi_clamp_simd!(i32x4, u8x4, (x0 + t2) >> FINAL_BITS),
+        stbi_clamp_simd!(i32x4, u8x4, (x2 + t0) >> FINAL_BITS),
+        stbi_clamp_simd!(i32x4, u8x4, (x2 - t0) >> FINAL_BITS),
+        stbi_clamp_simd!(i32x4, u8x4, (x0 - t2) >> FINAL_BITS),
+    ];
+
+    simd_transpose!(u8x4, results);
+
+    for i in 0..4 {
+        let n = i * output_linestride;
+        results[i].write_to_slice_unaligned(&mut output[n..n + 4]);
     }
 }
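One scale-bookkeeping detail in the second 4x4 pass: the even part is shifted left by `CONST_BITS` so it sits on the same Q12 scale as the odd-part products before everything comes down by `FINAL_BITS`. In scalar form:

```rust
// The even part (plain sums) must match the Q12 scale of the odd part
// (products of stbi_f2f constants) before the final shift.
let even = 10i32 << 12;            // 10.0 expressed in Q12
let odd = 10i32 * stbi_f2f(1.0);   // 10 * 4096, also 10.0 in Q12
assert_eq!(even, odd);
```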
 
@@ -301,11 +352,11 @@ fn stbi_clamp(x: Wrapping<i32>) -> u8
     x.0.max(0).min(255) as u8
 }
 
-fn stbi_f2f(x: f32) -> Wrapping<i32> {
-    Wrapping((x * 4096.0 + 0.5) as i32)
+fn stbi_f2f(x: f32) -> i32 {
+    (x * 4096.0 + 0.5) as i32
 }
 
-fn stbi_fsh(x: Wrapping<i32>) -> Wrapping<i32> {
+fn stbi_fsh_simd(x: i32x8) -> i32x8 {
     x << 12
 }
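`stbi_fsh_simd` plays the same role as the old `stbi_fsh`: it lifts a vector onto the Q12 scale of the `stbi_f2f` constants so the even and odd parts can be summed directly. For example:

```rust
// 1 becomes 4096, i.e. 1.0 in Q12, the same scale stbi_f2f produces.
let one_q12 = stbi_fsh_simd(i32x8::splat(1));
assert_eq!(one_q12.extract(0), 4096);
```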
 
@@ -379,3 +430,4 @@ fn test_dequantize_and_idct_block_8x8_saturated() {
         255, 255, 0, 255, 0, 255, 0, 0];
     assert_eq!(&output[..], &expected[..]);
 }
+
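One more sanity check the SIMD rewrite makes easy to state (a sketch, not part of this diff): an all-zero coefficient block must decode to a flat mid-gray block, since only the rounding constant and the +128 bias survive the transform.

```rust
#[test]
fn test_all_zero_block_decodes_to_mid_gray() {
    let coefficients = [0i16; 64];
    let quantization_table = [1u16; 64];
    let mut output = [0u8; 64];
    dequantize_and_idct_block_8x8(&coefficients, &quantization_table, 8, &mut output);
    // Every pixel should be exactly 128: 0 plus the +128 bias.
    assert!(output.iter().all(|&px| px == 128));
}
```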