diff --git a/benches/softmax.rs b/benches/softmax.rs
new file mode 100644
index 0000000..6ddb0be
--- /dev/null
+++ b/benches/softmax.rs
@@ -0,0 +1,86 @@
+#![feature(test)]
+#![feature(clone_from_slice)]
+
+extern crate test;
+extern crate collenchyma as co;
+extern crate collenchyma_nn as co_nn;
+extern crate rand;
+
+use test::Bencher;
+use co::backend::{Backend, BackendConfig};
+use co::frameworks::Native;
+use co::framework::IFramework;
+use co::tensor::SharedTensor;
+use co_nn::*;
+
+use rand::{thread_rng, Rng};
+
+fn backend() -> Backend<Native> {
+    let framework = Native::new();
+    let hardwares = framework.hardwares();
+    let backend_config = BackendConfig::new(framework, hardwares);
+    Backend::new(backend_config).unwrap()
+}
+
+fn arguments(backend: &Backend<Native>, size: usize) -> (SharedTensor<f32>, SharedTensor<f32>) {
+    let mut rng = thread_rng();
+    let slice_x = rng.gen_iter::<f32>().take(size).collect::<Vec<f32>>();
+
+    let mut x = SharedTensor::<f32>::new(backend.device(), &size).unwrap();
+    let out = SharedTensor::<f32>::new(backend.device(), &size).unwrap();
+    x.get_mut(backend.device()).unwrap().as_mut_native().unwrap().as_mut_slice().clone_from_slice(&slice_x);
+    (x, out)
+}
+
+fn arguments_grad(backend: &Backend<Native>, size: usize) -> (SharedTensor<f32>, SharedTensor<f32>, SharedTensor<f32>) {
+    let mut rng = thread_rng();
+    let slice_x = rng.gen_iter::<f32>().take(size).collect::<Vec<f32>>();
+
+    let mut x = SharedTensor::<f32>::new(backend.device(), &size).unwrap();
+    let mut dx = SharedTensor::<f32>::new(backend.device(), &size).unwrap();
+    let dout = SharedTensor::<f32>::new(backend.device(), &size).unwrap();
+    x.get_mut(backend.device()).unwrap().as_mut_native().unwrap().as_mut_slice().clone_from_slice(&slice_x);
+    dx.get_mut(backend.device()).unwrap().as_mut_native().unwrap().as_mut_slice().clone_from_slice(&slice_x);
+    (x, dx, dout)
+}
+
+#[inline(never)]
+fn bench_profile<F: FnMut() -> ()>(
+    b: &mut Bencher,
+    mut bench_func: F,
+    times: usize
+) {
+    b.iter(|| { for _ in 0..times { bench_func(); } });
+}
+
+#[bench]
+fn bench_1000_softmax_100_native(b: &mut Bencher) {
+    let backend = backend();
+    let (mut x, mut out) = arguments(&backend, 100);
+    let mut func = || { let _ = backend.softmax_plain(&mut x, &mut out); };
+    { func(); bench_profile(b, func, 1000); }
+}
+
+#[bench]
+fn bench_10_softmax_10000_native(b: &mut Bencher) {
+    let backend = backend();
+    let (mut x, mut out) = arguments(&backend, 10000);
+    let mut func = || { let _ = backend.softmax_plain(&mut x, &mut out); };
+    { func(); bench_profile(b, func, 10); }
+}
+
+#[bench]
+fn bench_1000_softmax_grad_100_native(b: &mut Bencher) {
+    let backend = backend();
+    let (mut x, mut dx, mut dout) = arguments_grad(&backend, 100);
+    let mut func = || { let _ = backend.softmax_grad_plain(&mut x, &mut dx, &mut dout); };
+    { func(); bench_profile(b, func, 1000); }
+}
+
+#[bench]
+fn bench_10_softmax_grad_10000_native(b: &mut Bencher) {
+    let backend = backend();
+    let (mut x, mut dx, mut dout) = arguments_grad(&backend, 10000);
+    let mut func = || { let _ = backend.softmax_grad_plain(&mut x, &mut dx, &mut dout); };
+    { func(); bench_profile(b, func, 10); }
+}
diff --git a/src/frameworks/native/helper.rs b/src/frameworks/native/helper.rs
index 399a5d1..ba35f2d 100644
--- a/src/frameworks/native/helper.rs
+++ b/src/frameworks/native/helper.rs
@@ -130,8 +130,8 @@ macro_rules! impl_ops_sigmoid_for {
                 result_diff: &mut ::co::tensor::SharedTensor<$t>
             ) -> Result<(), ::co::error::Error> {
                 match x.add_device(self.device()) { _ => try!(x.sync(self.device())) }
-                match x_diff.add_device(self.device()) { _ => try!(x.sync(self.device())) }
-                match result.add_device(self.device()) { _ => try!(x.sync(self.device())) }
+                match x_diff.add_device(self.device()) { _ => try!(x_diff.sync(self.device())) }
+                match result.add_device(self.device()) { _ => try!(result.sync(self.device())) }
                 match result_diff.add_device(self.device()) { _ => () }
                 self.sigmoid_grad_plain(x, x_diff, result, result_diff)
             }
@@ -193,8 +193,8 @@ macro_rules! impl_ops_relu_for {
                 result_diff: &mut ::co::tensor::SharedTensor<$t>
             ) -> Result<(), ::co::error::Error> {
                 match x.add_device(self.device()) { _ => try!(x.sync(self.device())) }
-                match x_diff.add_device(self.device()) { _ => try!(x.sync(self.device())) }
-                match result.add_device(self.device()) { _ => try!(x.sync(self.device())) }
+                match x_diff.add_device(self.device()) { _ => try!(x_diff.sync(self.device())) }
+                match result.add_device(self.device()) { _ => try!(result.sync(self.device())) }
                 self.relu_grad_plain(x, x_diff, result, result_diff)
             }

@@ -256,8 +256,8 @@ macro_rules! impl_ops_tanh_for {
                 result_diff: &mut ::co::tensor::SharedTensor<$t>
             ) -> Result<(), ::co::error::Error> {
                 match x.add_device(self.device()) { _ => try!(x.sync(self.device())) }
-                match x_diff.add_device(self.device()) { _ => try!(x.sync(self.device())) }
-                match result.add_device(self.device()) { _ => try!(x.sync(self.device())) }
+                match x_diff.add_device(self.device()) { _ => try!(x_diff.sync(self.device())) }
+                match result.add_device(self.device()) { _ => try!(result.sync(self.device())) }
                 self.tanh_grad_plain(x, x_diff, result, result_diff)
             }

@@ -354,16 +354,28 @@ macro_rules! impl_ops_softmax_for {
                 x: &mut ::co::tensor::SharedTensor<$t>,
                 result: &mut ::co::tensor::SharedTensor<$t>
             ) -> Result<(), ::co::error::Error> {
-                unimplemented!();
-                Ok(())
+                match x.add_device(self.device()) { _ => try!(x.sync(self.device())) }
+                match result.add_device(self.device()) { _ => () }
+                self.softmax_plain(x, result)
             }
             fn softmax_plain(
                 &self,
                 x: &::co::tensor::SharedTensor<$t>,
                 result: &mut ::co::tensor::SharedTensor<$t>
             ) -> Result<(), ::co::error::Error> {
-                unimplemented!();
-                Ok(())
+                if let Some(input) = x.get(self.device()).unwrap().as_native() {
+                    let mut exps = Vec::with_capacity(x.capacity());
+                    let mut sum : $t = 0 as $t;
+                    for exp in input.as_slice::<$t>().iter().map(|t|t.exp()) {
+                        exps.push(exp);
+                        sum += exp;
+                    }
+                    let res = exps.iter().map(|t| t / sum);
+                    ::frameworks::native::helper::write_to_memory(result.get_mut(self.device()).unwrap(), res);
+                    return Ok(());
+                }
+                Err(Error::Plugin(
+                    PluginError::Operation("Unable to execute Native softmax Forward.")))
             }
             fn softmax_grad(
                 &self,
@@ -371,8 +383,10 @@ macro_rules! impl_ops_softmax_for {
                 x_diff: &mut ::co::tensor::SharedTensor<$t>,
                 result_diff: &mut ::co::tensor::SharedTensor<$t>
             ) -> Result<(), ::co::error::Error> {
-                unimplemented!();
-                Ok(())
+                match x.add_device(self.device()) { _ => try!(x.sync(self.device())) }
+                match x_diff.add_device(self.device()) { _ => try!(x_diff.sync(self.device())) }
+                match result_diff.add_device(self.device()) { _ => () }
+                self.softmax_grad_plain(x, x_diff, result_diff)
             }
             fn softmax_grad_plain(
                 &self,
@@ -380,8 +394,24 @@ macro_rules! impl_ops_softmax_for {
                 x_diff: &::co::tensor::SharedTensor<$t>,
                 result_diff: &mut ::co::tensor::SharedTensor<$t>
             ) -> Result<(), ::co::error::Error> {
-                unimplemented!();
-                Ok(())
+                if let Some(sig_data) = x.get(self.device()).unwrap().as_native() {
+                    if let Some(sig_dx) = x_diff.get(self.device()).unwrap().as_native() {
+                        let mut dot : $t = 0 as $t;
+                        let sig_data_slice = sig_data.as_slice::<$t>();
+                        let sig_dx_slice = sig_dx.as_slice::<$t>();
+                        for (t, dt) in sig_data_slice.iter().zip(sig_dx_slice.iter()) {
+                            dot += t * dt;
+                        }
+                        let res = sig_data_slice.iter()
+                            .zip(sig_dx_slice.iter())
+                            .map(|(t, dt)| t * (dt - dot));
+                        ::frameworks::native::helper::write_to_memory(result_diff.get_mut(self.device()).unwrap(), res);
+                        return Ok(());
+                    }
+                }
+                Err(Error::Plugin(
+                    PluginError::Operation("Unable to execute Native softmax Backward.")))
+            }
         }
     );
diff --git a/src/frameworks/native/mod.rs b/src/frameworks/native/mod.rs
index c70b967..80d34ef 100644
--- a/src/frameworks/native/mod.rs
+++ b/src/frameworks/native/mod.rs
@@ -31,7 +31,7 @@ impl_ops_sigmoid_for!(f32, Backend<Native>);
 impl_ops_relu_for!(f32, Backend<Native>);
 impl_ops_tanh_for!(f32, Backend<Native>);
 // impl_ops_convolution_for!(f32, Backend<Native>);
-// impl_ops_softmax_for!(f32, Backend<Native>);
+impl_ops_softmax_for!(f32, Backend<Native>);
 // impl_ops_lrn_for!(f32, Backend<Native>);
 // impl_ops_pooling_for!(f32, Backend<Native>);

@@ -48,6 +48,6 @@ impl_ops_sigmoid_for!(f64, Backend<Native>);
 impl_ops_relu_for!(f64, Backend<Native>);
 impl_ops_tanh_for!(f64, Backend<Native>);
 // impl_ops_convolution_for!(f64, Backend<Native>);
-// impl_ops_softmax_for!(f64, Backend<Native>);
+impl_ops_softmax_for!(f64, Backend<Native>);
 // impl_ops_lrn_for!(f64, Backend<Native>);
 // impl_ops_pooling_for!(f64, Backend<Native>);
diff --git a/tests/softmax_specs.rs b/tests/softmax_specs.rs
index 985bd3f..3977b33 100644
--- a/tests/softmax_specs.rs
+++ b/tests/softmax_specs.rs
@@ -215,185 +215,173 @@ mod softmax_spec_cuda {
 #[cfg(test)]
 #[cfg(feature = "native")]
 mod softmax_spec_native {
-    // use co::backend::{Backend, BackendConfig};
-    // use co::framework::IFramework;
-    // use co::frameworks::Native;
-    // use co_nn::*;
-    // use co::memory::MemoryType;
-    // use co::tensor::SharedTensor;
-    // use co::plugin::numeric_helpers::{cast, Float};
-    //
-    //
-    // fn get_native_backend() -> Backend<Native> {
-    //     let framework = Native::new();
-    //     let hardwares = framework.hardwares();
-    //     let backend_config = BackendConfig::new(framework, hardwares);
-    //     Backend::new(backend_config).unwrap()
-    // }
-    //
-    // fn write_to_memory<T: Copy>(mem: &mut MemoryType, data: &[T]) {
-    //     match mem {
-    //         &mut MemoryType::Native(ref mut mem) => {
-    //             let mut mem_buffer = mem.as_mut_slice::<T>();
-    //             for (index, datum) in data.iter().enumerate() {
-    //                 mem_buffer[index] = *datum;
-    //             }
-    //         },
-    //         #[cfg(any(feature = "opencl", feature = "cuda"))]
-    //         _ => {}
-    //     }
-    // }
-    //
-    // fn get_memory<T: Float>(backend: &Backend<Native>) -> (SharedTensor<T>, SharedTensor<T>){
-    //     let val = cast::<f64, T>(1f64).unwrap();
-    //     let mut x = SharedTensor::<T>::new(backend.device(), &(1, 1, 4)).unwrap();
-    //     write_to_memory(x.get_mut(backend.device()).unwrap(), &[val, val, val, val]);
-    //
-    //     let result = SharedTensor::<T>::new(backend.device(), &(1, 1, 4)).unwrap();
-    //
-    //     (x, result)
-    // }
-    //
-    // fn get_grad_memory<T: Float>(backend: &Backend<Native>) -> (SharedTensor<T>, SharedTensor<T>, SharedTensor<T>){
-    //     let val = cast::<f64, T>(1f64).unwrap();
-    //     let val2 = cast::<f64, T>(2f64).unwrap();
-    //     let mut x = SharedTensor::<T>::new(backend.device(), &(1, 1, 3)).unwrap();
-    //     write_to_memory(x.get_mut(backend.device()).unwrap(), &[val, val, val2]);
-    //
-    //     let mut x_diff = SharedTensor::<T>::new(backend.device(), &(1, 1, 3)).unwrap();
-    //     write_to_memory(x_diff.get_mut(backend.device()).unwrap(), &[val, val, val2]);
-    //
-    //     let result_diff = SharedTensor::<T>::new(backend.device(), &(1, 1, 3)).unwrap();
-    //
-    //     (x, x_diff, result_diff)
-    // }
-
-
-    // #[test]
-    // #[ignore]
-    // fn it_computes_correct_softmax_on_native_for_f32() {
-    //     let backend = get_native_backend();
-    //     let (mut x, mut result) = get_memory::<f32>(&backend);
-    //
-    //     match backend.softmax(&mut x, &mut result) {
-    //         Ok(_) => {
-    //             if let Some(mem) = result.get(backend.device()).unwrap().as_native() {
-    //                 assert_eq!(&[0.25f32, 0.25f32, 0.25f32, 0.25f32], mem.as_slice::<f32>());
-    //             }
-    //         },
-    //         Err(err) => { println!("{:?}", err); assert!(false) }
-    //     }
-    // }
-    //
-    // #[test]
-    // #[ignore]
-    // fn it_computes_correct_softmax_on_native_for_f64() {
-    //     let backend = get_native_backend();
-    //     let (mut x, mut result) = get_memory::<f64>(&backend);
-    //
-    //     match backend.softmax(&mut x, &mut result) {
-    //         Ok(_) => {
-    //             if let Some(mem) = result.get(backend.device()).unwrap().as_native() {
-    //                 assert_eq!(&[0.25f64, 0.25f64, 0.25f64, 0.25f64], mem.as_slice::<f64>());
-    //             }
-    //         },
-    //         Err(err) => { println!("{:?}", err); assert!(false) }
-    //     }
-    // }
-    //
-    // #[test]
-    // #[ignore]
-    // fn it_computes_correct_softmax_on_native_for_f32_plain() {
-    //     let backend = get_native_backend();
-    //     let (mut x, mut result) = get_memory::<f32>(&backend);
-    //
-    //     match backend.softmax_plain(&mut x, &mut result) {
-    //         Ok(_) => {
-    //             if let Some(mem) = result.get(backend.device()).unwrap().as_native() {
-    //                 assert_eq!(&[0.25f32, 0.25f32, 0.25f32, 0.25f32], mem.as_slice::<f32>());
-    //             }
-    //         },
-    //         Err(err) => { println!("{:?}", err); assert!(false) }
-    //     }
-    // }
-    //
-    // #[test]
-    // #[ignore]
-    // fn it_computes_correct_softmax_on_native_for_f64_plain() {
-    //     let backend = get_native_backend();
-    //     let (mut x, mut result) = get_memory::<f64>(&backend);
-    //
-    //     match backend.softmax_plain(&mut x, &mut result) {
-    //         Ok(_) => {
-    //             if let Some(mem) = result.get(backend.device()).unwrap().as_native() {
-    //                 assert_eq!(&[0.25f64, 0.25f64, 0.25f64, 0.25f64], mem.as_slice::<f64>());
-    //             }
-    //         },
-    //         Err(err) => { println!("{:?}", err); assert!(false) }
-    //     }
-    // }
-    //
-    // #[test]
-    // #[ignore]
-    // fn it_computes_correct_softmax_grad_on_native_for_f32() {
-    //     let backend = get_native_backend();
-    //     let (mut x, mut x_diff, mut result_diff) = get_grad_memory::<f32>(&backend);
-    //
-    //     match backend.softmax_grad(&mut x, &mut x_diff, &mut result_diff) {
-    //         Ok(_) => {
-    //             if let Some(mem) = result_diff.get(backend.device()).unwrap().as_native() {
-    //                 assert_eq!(&[-5f32, -5f32, -8f32], mem.as_slice::<f32>());
-    //             }
-    //         },
-    //         Err(err) => { println!("{:?}", err); assert!(false) }
-    //     }
-    // }
-    //
-    // #[test]
-    // #[ignore]
-    // fn it_computes_correct_softmax_grad_on_native_for_f64() {
-    //     let backend = get_native_backend();
-    //     let (mut x, mut x_diff, mut result_diff) = get_grad_memory::<f64>(&backend);
-    //
-    //     match backend.softmax_grad(&mut x, &mut x_diff, &mut result_diff) {
-    //         Ok(_) => {
-    //             if let Some(mem) = result_diff.get(backend.device()).unwrap().as_native() {
-    //                 assert_eq!(&[-5f64, -5f64, -8f64], mem.as_slice::<f64>());
-    //             }
-    //         },
-    //         Err(err) => { println!("{:?}", err); assert!(false) }
-    //     }
-    // }
-    //
-    // #[test]
-    // #[ignore]
-    // fn it_computes_correct_softmax_grad_on_native_for_f32_plain() {
-    //     let backend = get_native_backend();
-    //     let (mut x, mut x_diff, mut result_diff) = get_grad_memory::<f32>(&backend);
-    //
-    //     match backend.softmax_grad_plain(&mut x, &mut x_diff, &mut result_diff) {
-    //         Ok(_) => {
-    //             if let Some(mem) = result_diff.get(backend.device()).unwrap().as_native() {
-    //                 assert_eq!(&[-5f32, -5f32, -8f32], mem.as_slice::<f32>());
-    //             }
-    //         },
-    //         Err(err) => { println!("{:?}", err); assert!(false) }
-    //     }
-    // }
-    //
-    // #[test]
-    // #[ignore]
-    // fn it_computes_correct_softmax_grad_on_native_for_f64_plain() {
-    //     let backend = get_native_backend();
-    //     let (mut x, mut x_diff, mut result_diff) = get_grad_memory::<f64>(&backend);
-    //
-    //     match backend.softmax_grad_plain(&mut x, &mut x_diff, &mut result_diff) {
-    //         Ok(_) => {
-    //             if let Some(mem) = result_diff.get(backend.device()).unwrap().as_native() {
-    //                 assert_eq!(&[-5f64, -5f64, -8f64], mem.as_slice::<f64>());
-    //             }
-    //         },
-    //         Err(err) => { println!("{:?}", err); assert!(false) }
-    //     }
-    // }
+    use co::backend::{Backend, BackendConfig};
+    use co::framework::IFramework;
+    use co::frameworks::Native;
+    use co_nn::*;
+    use co::memory::MemoryType;
+    use co::tensor::SharedTensor;
+    use co::plugin::numeric_helpers::{cast, Float};
+
+
+    fn get_native_backend() -> Backend<Native> {
+        let framework = Native::new();
+        let hardwares = framework.hardwares();
+        let backend_config = BackendConfig::new(framework, hardwares);
+        Backend::new(backend_config).unwrap()
+    }
+
+    fn write_to_memory<T: Copy>(mem: &mut MemoryType, data: &[T]) {
+        let &mut MemoryType::Native(ref mut mem) = mem;
+        let mut mem_buffer = mem.as_mut_slice::<T>();
+        for (index, datum) in data.iter().enumerate() {
+            mem_buffer[index] = *datum;
+        }
+    }
+
+
+    fn get_memory<T: Float>(backend: &Backend<Native>) -> (SharedTensor<T>, SharedTensor<T>){
+        let val = cast::<f64, T>(1f64).unwrap();
+        let mut x = SharedTensor::<T>::new(backend.device(), &(1, 1, 4)).unwrap();
+        write_to_memory(x.get_mut(backend.device()).unwrap(), &[val, val, val, val]);
+
+        let result = SharedTensor::<T>::new(backend.device(), &(1, 1, 4)).unwrap();
+
+        (x, result)
+    }
+
+    fn get_grad_memory<T: Float>(backend: &Backend<Native>) -> (SharedTensor<T>, SharedTensor<T>, SharedTensor<T>){
+        let val = cast::<f64, T>(1f64).unwrap();
+        let val2 = cast::<f64, T>(2f64).unwrap();
+        let mut x = SharedTensor::<T>::new(backend.device(), &(1, 1, 3)).unwrap();
+        write_to_memory(x.get_mut(backend.device()).unwrap(), &[val, val, val2]);
+
+        let mut x_diff = SharedTensor::<T>::new(backend.device(), &(1, 1, 3)).unwrap();
+        write_to_memory(x_diff.get_mut(backend.device()).unwrap(), &[val, val, val2]);
+
+        let result_diff = SharedTensor::<T>::new(backend.device(), &(1, 1, 3)).unwrap();
+
+        (x, x_diff, result_diff)
+    }
+
+
+    #[test]
+    fn it_computes_correct_softmax_on_native_for_f32() {
+        let backend = get_native_backend();
+        let (mut x, mut result) = get_memory::<f32>(&backend);
+
+        match backend.softmax(&mut x, &mut result) {
+            Ok(_) => {
+                if let Some(mem) = result.get(backend.device()).unwrap().as_native() {
+                    assert_eq!(&[0.25f32, 0.25f32, 0.25f32, 0.25f32], mem.as_slice::<f32>());
+                }
+            },
+            Err(err) => { println!("{:?}", err); assert!(false) }
+        }
+    }
+
+    #[test]
+    fn it_computes_correct_softmax_on_native_for_f64() {
+        let backend = get_native_backend();
+        let (mut x, mut result) = get_memory::<f64>(&backend);
+
+        match backend.softmax(&mut x, &mut result) {
+            Ok(_) => {
+                if let Some(mem) = result.get(backend.device()).unwrap().as_native() {
+                    assert_eq!(&[0.25f64, 0.25f64, 0.25f64, 0.25f64], mem.as_slice::<f64>());
+                }
+            },
+            Err(err) => { println!("{:?}", err); assert!(false) }
+        }
+    }
+
+    #[test]
+    fn it_computes_correct_softmax_on_native_for_f32_plain() {
+        let backend = get_native_backend();
+        let (mut x, mut result) = get_memory::<f32>(&backend);
+
+        match backend.softmax_plain(&mut x, &mut result) {
+            Ok(_) => {
+                if let Some(mem) = result.get(backend.device()).unwrap().as_native() {
+                    assert_eq!(&[0.25f32, 0.25f32, 0.25f32, 0.25f32], mem.as_slice::<f32>());
+                }
+            },
+            Err(err) => { println!("{:?}", err); assert!(false) }
+        }
+    }
+
+    #[test]
+    fn it_computes_correct_softmax_on_native_for_f64_plain() {
+        let backend = get_native_backend();
+        let (mut x, mut result) = get_memory::<f64>(&backend);
+
+        match backend.softmax_plain(&mut x, &mut result) {
+            Ok(_) => {
+                if let Some(mem) = result.get(backend.device()).unwrap().as_native() {
+                    assert_eq!(&[0.25f64, 0.25f64, 0.25f64, 0.25f64], mem.as_slice::<f64>());
+                }
+            },
+            Err(err) => { println!("{:?}", err); assert!(false) }
+        }
+    }
+
+    #[test]
+    fn it_computes_correct_softmax_grad_on_native_for_f32() {
+        let backend = get_native_backend();
+        let (mut x, mut x_diff, mut result_diff) = get_grad_memory::<f32>(&backend);
+
+        match backend.softmax_grad(&mut x, &mut x_diff, &mut result_diff) {
+            Ok(_) => {
+                if let Some(mem) = result_diff.get(backend.device()).unwrap().as_native() {
+                    assert_eq!(&[-5f32, -5f32, -8f32], mem.as_slice::<f32>());
+                }
+            },
+            Err(err) => { println!("{:?}", err); assert!(false) }
+        }
+    }
+
+    #[test]
+    fn it_computes_correct_softmax_grad_on_native_for_f64() {
+        let backend = get_native_backend();
+        let (mut x, mut x_diff, mut result_diff) = get_grad_memory::<f64>(&backend);
+
+        match backend.softmax_grad(&mut x, &mut x_diff, &mut result_diff) {
+            Ok(_) => {
+                if let Some(mem) = result_diff.get(backend.device()).unwrap().as_native() {
+                    assert_eq!(&[-5f64, -5f64, -8f64], mem.as_slice::<f64>());
+                }
+            },
+            Err(err) => { println!("{:?}", err); assert!(false) }
+        }
+    }
+
+    #[test]
+    fn it_computes_correct_softmax_grad_on_native_for_f32_plain() {
+        let backend = get_native_backend();
+        let (mut x, mut x_diff, mut result_diff) = get_grad_memory::<f32>(&backend);
+
+        match backend.softmax_grad_plain(&mut x, &mut x_diff, &mut result_diff) {
+            Ok(_) => {
+                if let Some(mem) = result_diff.get(backend.device()).unwrap().as_native() {
+                    assert_eq!(&[-5f32, -5f32, -8f32], mem.as_slice::<f32>());
+                }
+            },
+            Err(err) => { println!("{:?}", err); assert!(false) }
+        }
+    }
+
+    #[test]
+    fn it_computes_correct_softmax_grad_on_native_for_f64_plain() {
+        let backend = get_native_backend();
+        let (mut x, mut x_diff, mut result_diff) = get_grad_memory::<f64>(&backend);
+
+        match backend.softmax_grad_plain(&mut x, &mut x_diff, &mut result_diff) {
+            Ok(_) => {
+                if let Some(mem) = result_diff.get(backend.device()).unwrap().as_native() {
+                    assert_eq!(&[-5f64, -5f64, -8f64], mem.as_slice::<f64>());
+                }
+            },
+            Err(err) => { println!("{:?}", err); assert!(false) }
+        }
+    }
 }
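
Note: for reference, below is a minimal standalone sketch (not part of the patch) of the arithmetic the native kernels above implement, written against plain f32 slices rather than the collenchyma SharedTensor API; the helper names softmax and softmax_grad here are illustrative only. The forward pass normalizes exponentials, and the backward pass computes y_i * (dy_i - dot(y, dy)), where the kernel's `x` argument holds the softmax output and `x_diff` the incoming gradient. This reproduces the [0.25; 4] and [-5, -5, -8] expectations used in the tests.

// Standalone sketch of the math behind softmax_plain / softmax_grad_plain above.
// Illustrative only; names and signatures are not part of the plugin API.

fn softmax(input: &[f32]) -> Vec<f32> {
    // y_i = exp(x_i) / sum_j exp(x_j)
    let exps: Vec<f32> = input.iter().map(|v| v.exp()).collect();
    let sum: f32 = exps.iter().sum();
    exps.iter().map(|e| e / sum).collect()
}

fn softmax_grad(y: &[f32], dy: &[f32]) -> Vec<f32> {
    // dx_i = y_i * (dy_i - dot(y, dy)), matching the kernel's use of `x` as the
    // softmax output and `x_diff` as the incoming gradient.
    let dot: f32 = y.iter().zip(dy).map(|(a, b)| a * b).sum();
    y.iter().zip(dy).map(|(a, b)| a * (b - dot)).collect()
}

fn main() {
    // Forward test expectation: softmax of four equal inputs is 0.25 each.
    let fwd = softmax(&[1.0, 1.0, 1.0, 1.0]);
    assert!(fwd.iter().all(|v| (v - 0.25).abs() < 1e-6));

    // Grad test expectation: inputs [1, 1, 2] with gradient [1, 1, 2]
    // give dot = 6 and output [-5, -5, -8].
    let grad = softmax_grad(&[1.0, 1.0, 2.0], &[1.0, 1.0, 2.0]);
    assert_eq!(grad, vec![-5.0f32, -5.0, -8.0]);
}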