diff --git a/.github/workflows/clippy.yml b/.github/workflows/clippy.yml
new file mode 100644
index 0000000..e34788a
--- /dev/null
+++ b/.github/workflows/clippy.yml
@@ -0,0 +1,34 @@
+name: clippy
+on:
+  push:
+    branches: [main]
+  pull_request:
+
+  workflow_dispatch:
+jobs:
+  clippy:
+    runs-on: ubuntu-22.04
+
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          submodules: "recursive"
+      - name: Run sccache-cache
+        uses: mozilla-actions/sccache-action@v0.0.3
+      - name: Run sccache stat for check before
+        shell: bash
+        run: ${SCCACHE_PATH} --show-stats
+      - uses: actions-rs/toolchain@v1
+        with:
+          toolchain: nightly
+      - name: cargo clippy
+        env:
+          SCCACHE_GHA_ENABLED: "true"
+          RUSTC_WRAPPER: "sccache"
+        run: |
+          rustup default nightly
+          rustup component add clippy
+          cargo clippy --all-targets --all-features
+      - name: Run sccache stat for check after
+        shell: bash
+        run: ${SCCACHE_PATH} --show-stats
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
new file mode 100644
index 0000000..ac77c12
--- /dev/null
+++ b/.github/workflows/docs.yml
@@ -0,0 +1,33 @@
+name: docs
+on:
+  push:
+    branches: [main]
+  pull_request:
+
+  workflow_dispatch:
+jobs:
+  docs:
+    runs-on: ubuntu-22.04
+
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          submodules: "recursive"
+      - name: Run sccache-cache
+        uses: mozilla-actions/sccache-action@v0.0.3
+      - name: Run sccache stat for check before
+        shell: bash
+        run: ${SCCACHE_PATH} --show-stats
+      - uses: actions-rs/toolchain@v1
+        with:
+          toolchain: nightly
+      - name: cargo doc
+        env:
+          SCCACHE_GHA_ENABLED: "true"
+          RUSTC_WRAPPER: "sccache"
+        run: |
+          rustup default nightly
+          cargo doc --no-deps --all-features
+      - name: Run sccache stat for check after
+        shell: bash
+        run: ${SCCACHE_PATH} --show-stats
diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml
new file mode 100644
index 0000000..11c6fe1
--- /dev/null
+++ b/.github/workflows/format.yml
@@ -0,0 +1,28 @@
+name: format
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  format:
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Checkout workspace
+        uses: actions/checkout@v3
+      - name: Install and set up pre-commit
+        run: |
+          pip install pre-commit
+          pre-commit install
+      - name: Run pre-commit checks
+        run: pre-commit run --all-files
+      - uses: actions-rs/toolchain@v1
+        with:
+          toolchain: nightly
+      - name: cargo doc
+        run: |
+          rustup default nightly
+          cargo doc --no-deps --all-features
diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
new file mode 100644
index 0000000..819c99b
--- /dev/null
+++ b/.github/workflows/nightly.yml
@@ -0,0 +1,40 @@
+name: nightly
+on:
+  push:
+    branches: [main]
+  pull_request:
+
+  workflow_dispatch:
+jobs:
+  nightly:
+    runs-on: ubuntu-24.04
+
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          submodules: "recursive"
+      - name: Run sccache-cache
+        uses: mozilla-actions/sccache-action@v0.0.3
+      - name: Run sccache stat for check before
+        shell: bash
+        run: ${SCCACHE_PATH} --show-stats
+      - uses: actions-rs/toolchain@v1
+        with:
+          toolchain: nightly
+      - name: cargo build
+        env:
+          SCCACHE_GHA_ENABLED: "true"
+          RUSTC_WRAPPER: "sccache"
+        run: |
+          rustup default nightly
+          cargo build --release --all-targets --all-features
+      - name: cargo test
+        env:
+          SCCACHE_GHA_ENABLED: "true"
+          RUSTC_WRAPPER: "sccache"
+        run: |
+          rustup default nightly
+          cargo test --release --all-features
+      - name: Run sccache stat for check after
+        shell: bash
+        run: ${SCCACHE_PATH} --show-stats
diff --git a/.github/workflows/main.yml b/.github/workflows/stable.yml
similarity index 72%
rename from .github/workflows/main.yml
rename to .github/workflows/stable.yml
index 86631fc..05c3a9d 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/stable.yml
@@ -1,4 +1,4 @@
-name: main
+name: stable
 on:
   push:
     branches: [main]
@@ -6,10 +6,10 @@ on:
 
   workflow_dispatch:
 jobs:
-  build:
-    runs-on: ubuntu-20.04
+  stable:
+    runs-on: ubuntu-22.04
 
-  steps:
+    steps:
     - uses: actions/checkout@v3
       with:
        submodules: "recursive"
@@ -21,18 +21,18 @@ jobs:
     - uses: actions-rs/toolchain@v1
       with:
        toolchain: stable
-    - name: cargo
+    - name: cargo build
+      env:
+        SCCACHE_GHA_ENABLED: "true"
+        RUSTC_WRAPPER: "sccache"
+      run: |
+        cargo build --release --all-targets
+    - name: cargo test
       env:
         SCCACHE_GHA_ENABLED: "true"
         RUSTC_WRAPPER: "sccache"
       run: |
-        sudo apt-get update -y
-        sudo apt-get install -y nasm
-        cargo doc --no-deps
-        cargo fmt --check
         cargo test --release
     - name: Run sccache stat for check after
       shell: bash
       run: ${SCCACHE_PATH} --show-stats
-
-
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..66686d7
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,17 @@
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.1.0
+    hooks:
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+        exclude: (thirdparty/.*)|(SOPHUS_VERSION)|(.txt)$
+      - id: check-yaml
+        args: ["--unsafe"]
+      - id: check-json
+  - repo: https://github.com/codespell-project/codespell
+    rev: v2.1.0
+    hooks:
+      - id: codespell
+        args:
+          - --ignore-words-list
+          - "te,tring,crate"
diff --git a/Cargo.toml b/Cargo.toml
index 3dd5331..7ad52eb 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -16,22 +16,23 @@ edition = "2021"
 include = [
   "**/*.rs",
   "**/*.wgsl",
-  "Cargo.toml",
+  "**/*.md",
+  "**/Cargo.toml",
 ]
 keywords = ["robotics", "optimization"]
 license = "MIT OR Apache-2.0"
 repository = "https://github.com/farm-ng/sophus-rs/"
-version = "0.6.1"
+version = "0.7.0"
 
 [workspace.dependencies]
-sophus = {path = "crates/sophus", version = "0.6.1"}
-sophus_core = {path = "crates/sophus_core", version = "0.6.1"}
-sophus_image = {path = "crates/sophus_image", version = "0.6.1"}
-sophus_lie = {path = "crates/sophus_lie", version = "0.6.1"}
-sophus_opt = {path = "crates/sophus_opt", version = "0.6.1"}
-sophus_pyo3 = {path = "crates/sophus_pyo3", version = "0.6.1"}
-sophus_sensor = {path = "crates/sophus_sensor", version = "0.6.1"}
-sophus_viewer = {path = "crates/sophus_viewer", version = "0.6.1"}
+sophus = {path = "crates/sophus", version = "0.7.0"}
+sophus_core = {path = "crates/sophus_core", version = "0.7.0"}
+sophus_image = {path = "crates/sophus_image", version = "0.7.0"}
+sophus_lie = {path = "crates/sophus_lie", version = "0.7.0"}
+sophus_opt = {path = "crates/sophus_opt", version = "0.7.0"}
+sophus_pyo3 = {path = "crates/sophus_pyo3", version = "0.7.0"}
+sophus_sensor = {path = "crates/sophus_sensor", version = "0.7.0"}
+sophus_viewer = {path = "crates/sophus_viewer", version = "0.7.0"}
 
 approx = "0.5"
 as-any = "0.3"
@@ -58,7 +59,6 @@ num-traits = "0.2"
 numpy = "0.21"
 png = "0.17"
 rand = "0.8"
-sleef = "0.3"
 tokio = {version = "1", features = ["full"]}
 typenum = {version = "1.17", features = ["const-generics"]}
 wgpu = "0.19"
diff --git a/LICENSE-MIT b/LICENSE-MIT
index 9e82be3..e8864fe 100644
--- a/LICENSE-MIT
+++ b/LICENSE-MIT
@@ -23,4 +23,4 @@
 SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 ARISING FROM, OUT OF OR IN
 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-DEALINGS IN THE SOFTWARE.
\ No newline at end of file
+DEALINGS IN THE SOFTWARE.
diff --git a/README.md b/README.md
index 0d69f40..14613b4 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,35 @@
 # sophus-rs
-This library is in an early development stage - hence API is highly unstable.
+sophus-rs is a Rust library for 2d and 3d geometry for Computer Vision and Robotics
+applications. It is a spin-off of the [Sophus](https://github.com/strasdat/Sophus) C++ library,
+which focuses on **Lie groups** (e.g. rotations and transformations in 2d and 3d).
 
-Sophus-rs currently requires rust nightly. The only nightly feature it uses is `portable-simd`.
-It is the intention to move to rust stable once `portable-simd` is stabilized. There is no current
-plan to depend on other nightly features.
+In addition to Lie groups, sophus-rs also includes other geometric/maths concepts such as unit
+vectors, splines, image classes, and camera models, as well as other utilities such as
+non-linear least-squares optimization.
+
+## Status
+
+This library is in an early development stage - hence the API is highly unstable. It is likely
+that existing features will be removed or changed in the future.
+
+However, the intent is to strive for correctness, facilitated by a comprehensive test suite.
+
+## Building
+
+sophus-rs builds on stable Rust:
+
+```toml
+[dependencies]
+sophus = "0.7.0"
+```
+
+To allow for batch types such as BatchScalarF64, the 'simd' feature is required. This feature
+depends on [`portable-simd`](https://doc.rust-lang.org/std/simd/index.html), which is currently
+only available on [nightly](https://doc.rust-lang.org/book/appendix-07-nightly-rust.html). There
+are no plans to use any other nightly features.
+
+```toml
+[dependencies]
+sophus = { version = "0.7.0", features = ["simd"] }
+```
diff --git a/crates/sophus/Cargo.toml b/crates/sophus/Cargo.toml
index 277f003..f23004c 100644
--- a/crates/sophus/Cargo.toml
+++ b/crates/sophus/Cargo.toml
@@ -30,3 +30,13 @@ nalgebra.workspace = true
 ndarray.workspace = true
 tokio.workspace = true
 wgpu.workspace = true
+
+[features]
+simd = [
+  "sophus_core/simd",
+  "sophus_image/simd",
+  "sophus_lie/simd",
+  "sophus_opt/simd",
+  "sophus_sensor/simd",
+  "sophus_viewer/simd",
+]
diff --git a/crates/sophus/src/lib.rs b/crates/sophus/src/lib.rs
index 4ae3763..ccb7f72 100644
--- a/crates/sophus/src/lib.rs
+++ b/crates/sophus/src/lib.rs
@@ -1,6 +1,9 @@
-#![feature(portable_simd)]
+#![cfg_attr(feature = "simd", feature(portable_simd))]
 #![allow(clippy::needless_range_loop)]
+#![doc = include_str!(concat!("../", std::env!("CARGO_PKG_README")))]
+
+
 pub mod examples;
 
 #[doc(inline)]
diff --git a/crates/sophus_core/Cargo.toml b/crates/sophus_core/Cargo.toml
index c4786f2..4ca03b5 100644
--- a/crates/sophus_core/Cargo.toml
+++ b/crates/sophus_core/Cargo.toml
@@ -17,5 +17,10 @@ concat-arrays.workspace = true
 nalgebra.workspace = true
 ndarray.workspace = true
 num-traits.workspace = true
-sleef.workspace = true
 typenum.workspace = true
+
+sleef = { version = "0.3", optional = true }
+
+
+[features]
+simd = ["sleef"]
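The reorganized `dual` module below is the heart of the forward-mode automatic differentiation in sophus_core: a dual number stores a `real_part` plus an optional `dij_part` holding the derivative seeded by `new_with_dij`. As a quick orientation before the file-by-file diff, here is a minimal, hypothetical usage sketch. The batch variants of these methods (`new_with_dij`, `dij_val`, `sin`) appear verbatim in the hunks below; the scalar `DualScalar` is assumed to mirror them, and the snippet itself is illustrative, not part of this patch:

```rust
// Hypothetical sketch, not from this PR: differentiate sin at x = 1 using
// the dual-scalar type that the hunks below reorganize.
use sophus_core::calculus::dual::dual_scalar::DualScalar;
use sophus_core::prelude::*;

fn main() {
    // Seed the infinitesimal (dij) part so `x` carries d/dx along with x.
    let x = DualScalar::new_with_dij(1.0);
    let y = x.sin();
    // The real part is sin(1); the seeded dij slot now holds cos(1).
    println!("sin(1) = {}", y.real_part);
    let dy = y.dij_val().expect("derivative was seeded").get([0, 0]);
    println!("d/dx sin(x) at x = 1: {dy}"); // ~0.5403, i.e. cos(1)
}
```

diff --git a/crates/sophus_core/src/calculus/dual.rs b/crates/sophus_core/src/calculus/dual.rs
index bb7ec3a..bf36c03 100644
--- a/crates/sophus_core/src/calculus/dual.rs
+++ b/crates/sophus_core/src/calculus/dual.rs
@@ -1,14 +1,29 @@
-/// DualScalar matrix.
+/// Dual matrix.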
pub mod dual_matrix; -pub use crate::calculus::dual::dual_matrix::DualBatchMatrix; pub use crate::calculus::dual::dual_matrix::DualMatrix; -/// DualScalar scalar. +#[cfg(feature = "simd")] +/// Dual batch matrix. +pub mod dual_batch_matrix; +#[cfg(feature = "simd")] +pub use crate::calculus::dual::dual_batch_matrix::DualBatchMatrix; + +/// Dual scalar. pub mod dual_scalar; -pub use crate::calculus::dual::dual_scalar::DualBatchScalar; pub use crate::calculus::dual::dual_scalar::DualScalar; -/// DualScalar vector. +#[cfg(feature = "simd")] +/// Dual batch scalar. +pub mod dual_batch_scalar; +#[cfg(feature = "simd")] +pub use crate::calculus::dual::dual_batch_scalar::DualBatchScalar; + +/// Dual vector. pub mod dual_vector; -pub use crate::calculus::dual::dual_vector::DualBatchVector; pub use crate::calculus::dual::dual_vector::DualVector; + +#[cfg(feature = "simd")] +/// Dual batch vector. +pub mod dual_batch_vector; +#[cfg(feature = "simd")] +pub use crate::calculus::dual::dual_batch_vector::DualBatchVector; diff --git a/crates/sophus_core/src/calculus/dual/dual_batch_matrix.rs b/crates/sophus_core/src/calculus/dual/dual_batch_matrix.rs new file mode 100644 index 0000000..c5ba96b --- /dev/null +++ b/crates/sophus_core/src/calculus/dual/dual_batch_matrix.rs @@ -0,0 +1,694 @@ +use crate::calculus::dual::dual_batch_scalar::DualBatchScalar; +use crate::calculus::dual::dual_matrix::DijPairMV; +use crate::calculus::dual::DualBatchVector; +use crate::linalg::BatchMatF64; +use crate::linalg::BatchScalarF64; +use crate::linalg::BatchVecF64; +use crate::prelude::*; +use crate::tensor::mut_tensor::MutTensorDD; +use crate::tensor::mut_tensor::MutTensorDDR; +use crate::tensor::mut_tensor::MutTensorDDRC; +use approx::AbsDiffEq; +use approx::RelativeEq; +use num_traits::Zero; +use std::fmt::Debug; +use std::ops::Add; +use std::ops::Mul; +use std::ops::Neg; +use std::ops::Sub; +use std::simd::LaneCount; +use std::simd::Mask; +use std::simd::SupportedLaneCount; + +use crate::calculus::dual::dual_matrix::DijPairM; + +/// DualScalarLike matrix +#[derive(Clone)] +pub struct DualBatchMatrix +where + BatchScalarF64: IsCoreScalar, + LaneCount: SupportedLaneCount, +{ + /// real part + pub real_part: BatchMatF64, + /// infinitesimal part - represents derivative + pub dij_part: Option, ROWS, COLS>>, +} + +impl IsDual + for DualBatchMatrix +where + LaneCount: SupportedLaneCount, +{ +} + +impl + IsDualMatrix, ROWS, COLS, BATCH> for DualBatchMatrix +where + LaneCount: SupportedLaneCount, +{ + /// Create a new dual number + fn new_with_dij(val: BatchMatF64) -> Self { + let mut dij_val = + MutTensorDDRC::, ROWS, COLS>::from_shape([ROWS, COLS]); + for i in 0..ROWS { + for j in 0..COLS { + dij_val.mut_view().get_mut([i, j])[(i, j)] = BatchScalarF64::::from_f64(1.0); + } + } + + Self { + real_part: val, + dij_part: Some(dij_val), + } + } + + /// Get the derivative + fn dij_val(self) -> Option, ROWS, COLS>> { + self.dij_part + } +} + +impl DualBatchMatrix +where + LaneCount: SupportedLaneCount, +{ + fn binary_mm_dij< + const R0: usize, + const R1: usize, + const C0: usize, + const C1: usize, + F: FnMut(&BatchMatF64) -> BatchMatF64, + G: FnMut(&BatchMatF64) -> BatchMatF64, + >( + lhs_dx: &Option, R0, C0>>, + rhs_dx: &Option, R1, C1>>, + mut left_op: F, + mut right_op: G, + ) -> Option, ROWS, COLS>> { + match (lhs_dx, rhs_dx) { + (None, None) => None, + (None, Some(rhs_dij)) => { + let out_dij = MutTensorDDRC::from_map(&rhs_dij.view(), |r_dij| right_op(r_dij)); + Some(out_dij) + } + (Some(lhs_dij), None) => { + let out_dij = 
MutTensorDDRC::from_map(&lhs_dij.view(), |l_dij| left_op(l_dij)); + Some(out_dij) + } + (Some(lhs_dij), Some(rhs_dij)) => { + let dyn_mat = + MutTensorDDRC::from_map2(&lhs_dij.view(), &rhs_dij.view(), |l_dij, r_dij| { + left_op(l_dij) + right_op(r_dij) + }); + Some(dyn_mat) + } + } + } + + fn binary_mv_dij< + const R0: usize, + const R1: usize, + const C0: usize, + F: FnMut(&BatchMatF64) -> BatchVecF64, + G: FnMut(&BatchVecF64) -> BatchVecF64, + >( + lhs_dx: &Option, R0, C0>>, + rhs_dx: &Option, R1>>, + mut left_op: F, + mut right_op: G, + ) -> Option, ROWS>> { + match (lhs_dx, rhs_dx) { + (None, None) => None, + (None, Some(rhs_dij)) => { + let out_dij = MutTensorDDR::from_map(&rhs_dij.view(), |r_dij| right_op(r_dij)); + Some(out_dij) + } + (Some(lhs_dij), None) => { + let out_dij = MutTensorDDR::from_map(&lhs_dij.view(), |l_dij| left_op(l_dij)); + Some(out_dij) + } + (Some(lhs_dij), Some(rhs_dij)) => { + let dyn_mat = + MutTensorDDR::from_map2(&lhs_dij.view(), &rhs_dij.view(), |l_dij, r_dij| { + left_op(l_dij) + right_op(r_dij) + }); + Some(dyn_mat) + } + } + } + + fn binary_ms_dij< + const R0: usize, + const C0: usize, + F: FnMut(&BatchMatF64) -> BatchMatF64, + G: FnMut(&BatchScalarF64) -> BatchMatF64, + >( + lhs_dx: &Option, R0, C0>>, + rhs_dx: &Option>>, + mut left_op: F, + mut right_op: G, + ) -> Option, ROWS, COLS>> { + match (lhs_dx, rhs_dx) { + (None, None) => None, + (None, Some(rhs_dij)) => { + let out_dij = MutTensorDDRC::from_map(&rhs_dij.view(), |r_dij| right_op(r_dij)); + Some(out_dij) + } + (Some(lhs_dij), None) => { + let out_dij = MutTensorDDRC::from_map(&lhs_dij.view(), |l_dij| left_op(l_dij)); + Some(out_dij) + } + (Some(lhs_dij), Some(rhs_dij)) => { + let dyn_mat = + MutTensorDDRC::from_map2(&lhs_dij.view(), &rhs_dij.view(), |l_dij, r_dij| { + left_op(l_dij) + right_op(r_dij) + }); + Some(dyn_mat) + } + } + } + + /// derivatives + pub fn two_dx( + mut lhs_dx: Option, R1, C1>>, + mut rhs_dx: Option, R2, C2>>, + ) -> Option, R1, C1, R2, C2>> { + if lhs_dx.is_none() && rhs_dx.is_none() { + return None; + } + + if lhs_dx.is_some() && rhs_dx.is_some() { + assert_eq!( + lhs_dx.clone().unwrap().dims(), + rhs_dx.clone().unwrap().dims() + ); + } + + if lhs_dx.is_none() { + lhs_dx = Some(MutTensorDDRC::, R1, C1>::from_shape( + rhs_dx.clone().unwrap().dims(), + )) + } else if rhs_dx.is_none() { + rhs_dx = Some(MutTensorDDRC::, R2, C2>::from_shape( + lhs_dx.clone().unwrap().dims(), + )) + } + + Some(DijPairM { + lhs: lhs_dx.unwrap(), + rhs: rhs_dx.unwrap(), + }) + } + + /// derivatives + pub fn two_dx_from_vec( + mut lhs_dx: Option, ROWS, COLS>>, + mut rhs_dx: Option, COLS>>, + ) -> Option, ROWS, COLS>> { + if lhs_dx.is_none() && rhs_dx.is_none() { + return None; + } + + if lhs_dx.is_some() && rhs_dx.is_some() { + assert_eq!( + lhs_dx.clone().unwrap().dims(), + rhs_dx.clone().unwrap().dims() + ); + } + + if lhs_dx.is_none() { + lhs_dx = Some( + MutTensorDDRC::, ROWS, COLS>::from_shape( + rhs_dx.clone().unwrap().dims(), + ), + ) + } else if rhs_dx.is_none() { + rhs_dx = Some(MutTensorDDR::, COLS>::from_shape( + lhs_dx.clone().unwrap().dims(), + )) + } + + Some(DijPairMV::, ROWS, COLS> { + lhs: lhs_dx.unwrap(), + rhs: rhs_dx.unwrap(), + }) + } +} + +impl PartialEq + for DualBatchMatrix +where + LaneCount: SupportedLaneCount, +{ + fn eq(&self, other: &Self) -> bool { + self.real_part == other.real_part && self.dij_part == other.dij_part + } +} + +impl AbsDiffEq + for DualBatchMatrix +where + LaneCount: SupportedLaneCount, +{ + type Epsilon = f64; + + fn default_epsilon() -> 
Self::Epsilon { + f64::default_epsilon() + } + + fn abs_diff_eq(&self, other: &Self, epsilon: Self::Epsilon) -> bool { + self.real_part.abs_diff_eq(&other.real_part, epsilon) + } +} + +impl RelativeEq + for DualBatchMatrix +where + LaneCount: SupportedLaneCount, +{ + fn default_max_relative() -> Self::Epsilon { + f64::default_max_relative() + } + + fn relative_eq( + &self, + other: &Self, + epsilon: Self::Epsilon, + max_relative: Self::Epsilon, + ) -> bool { + self.real_part + .relative_eq(&other.real_part, epsilon, max_relative) + } +} + +impl + IsMatrix, ROWS, COLS, BATCH> for DualBatchMatrix +where + LaneCount: SupportedLaneCount, +{ + fn from_f64(val: f64) -> Self { + DualBatchMatrix { + real_part: BatchMatF64::::from_f64(val), + dij_part: None, + } + } + + fn set_elem(&mut self, idx: [usize; 2], val: DualBatchScalar) { + self.real_part.set_elem(idx, val.real_part); + if self.dij_part.is_some() { + let dij = &mut self.dij_part.as_mut().unwrap(); + for i in 0..dij.dims()[0] { + for j in 0..dij.dims()[1] { + dij.mut_view().get_mut([i, j])[(idx[0], idx[1])] = + val.dij_part.clone().unwrap().get([i, j]); + } + } + } + } + + fn from_scalar(val: DualBatchScalar) -> Self { + DualBatchMatrix { + real_part: BatchMatF64::::from_scalar(val.real_part), + dij_part: val.dij_part.map(|dij_val| { + MutTensorDDRC::from_map(&dij_val.view(), |v| { + BatchMatF64::::from_scalar(*v) + }) + }), + } + } + + fn mat_mul( + &self, + rhs: DualBatchMatrix, + ) -> DualBatchMatrix { + DualBatchMatrix { + real_part: self.real_part * rhs.real_part, + dij_part: DualBatchMatrix::::binary_mm_dij( + &self.dij_part, + &rhs.dij_part, + |l_dij| l_dij * rhs.real_part, + |r_dij| self.real_part * r_dij, + ), + } + } + + fn from_real_matrix(val: BatchMatF64) -> Self { + Self { + real_part: val, + dij_part: None, + } + } + + fn scaled(&self, s: DualBatchScalar) -> Self { + DualBatchMatrix { + real_part: self.real_part * s.real_part, + dij_part: DualBatchMatrix::::binary_ms_dij( + &self.dij_part, + &s.dij_part, + |l_dij| l_dij * s.real_part, + |r_dij| self.real_part * *r_dij, + ), + } + } + + fn identity() -> Self { + DualBatchMatrix::from_real_matrix(BatchMatF64::::identity()) + } + + fn get_elem(&self, idx: [usize; 2]) -> DualBatchScalar { + DualBatchScalar:: { + real_part: self.real_part.get_elem(idx), + dij_part: self + .dij_part + .clone() + .map(|dij_val| MutTensorDD::from_map(&dij_val.view(), |v| v[(idx[0], idx[1])])), + } + } + + fn from_array2(duals: [[DualBatchScalar; COLS]; ROWS]) -> Self { + let mut shape = None; + let mut val_mat = BatchMatF64::::zeros(); + for i in 0..duals.len() { + let d_rows = duals[i].clone(); + for j in 0..d_rows.len() { + let d = d_rows.clone()[j].clone(); + + val_mat[(i, j)] = d.real_part; + if d.dij_part.is_some() { + shape = Some(d.dij_part.clone().unwrap().dims()); + } + } + } + + if shape.is_none() { + return DualBatchMatrix { + real_part: val_mat, + dij_part: None, + }; + } + let shape = shape.unwrap(); + + let mut r = MutTensorDDRC::, ROWS, COLS>::from_shape(shape); + for i in 0..duals.len() { + let d_rows = duals[i].clone(); + for j in 0..d_rows.len() { + let d = d_rows.clone()[j].clone(); + if d.dij_part.is_some() { + for d0 in 0..shape[0] { + for d1 in 0..shape[1] { + r.mut_view().get_mut([d0, d1])[(i, j)] = + d.dij_part.clone().unwrap().get([d0, d1]); + } + } + } + } + } + DualBatchMatrix { + real_part: val_mat, + dij_part: Some(r), + } + } + + fn real_matrix(&self) -> &BatchMatF64 { + &self.real_part + } + + fn block_mat2x2( + top_row: ( + DualBatchMatrix, + DualBatchMatrix, + ), + 
bot_row: ( + DualBatchMatrix, + DualBatchMatrix, + ), + ) -> Self { + assert_eq!(R0 + R1, ROWS); + assert_eq!(C0 + C1, COLS); + + Self::block_mat2x1( + DualBatchMatrix::::block_mat1x2(top_row.0, top_row.1), + DualBatchMatrix::::block_mat1x2(bot_row.0, bot_row.1), + ) + } + + fn block_mat2x1( + top_row: DualBatchMatrix, + bot_row: DualBatchMatrix, + ) -> Self { + assert_eq!(R0 + R1, ROWS); + let maybe_dij = Self::two_dx(top_row.dij_part, bot_row.dij_part); + + Self { + real_part: BatchMatF64::::block_mat2x1( + top_row.real_part, + bot_row.real_part, + ), + dij_part: match maybe_dij { + Some(dij_val) => { + let mut r = MutTensorDDRC::, ROWS, COLS>::from_shape( + dij_val.shape(), + ); + for d0 in 0..dij_val.shape()[0] { + for d1 in 0..dij_val.shape()[1] { + *r.mut_view().get_mut([d0, d1]) = + BatchMatF64::::block_mat2x1( + dij_val.lhs.get([d0, d1]), + dij_val.rhs.get([d0, d1]), + ); + } + } + Some(r) + } + None => None, + }, + } + } + + fn block_mat1x2( + left_col: DualBatchMatrix, + righ_col: DualBatchMatrix, + ) -> Self { + assert_eq!(C0 + C1, COLS); + let maybe_dij = Self::two_dx(left_col.dij_part, righ_col.dij_part); + + Self { + real_part: BatchMatF64::::block_mat1x2( + left_col.real_part, + righ_col.real_part, + ), + dij_part: match maybe_dij { + Some(dij_val) => { + let mut r = MutTensorDDRC::, ROWS, COLS>::from_shape( + dij_val.shape(), + ); + for d0 in 0..dij_val.shape()[0] { + for d1 in 0..dij_val.shape()[1] { + *r.mut_view().get_mut([d0, d1]) = + BatchMatF64::::block_mat1x2( + dij_val.lhs.get([d0, d1]), + dij_val.rhs.get([d0, d1]), + ); + } + } + Some(r) + } + None => None, + }, + } + } + + fn get_fixed_submat( + &self, + start_r: usize, + start_c: usize, + ) -> DualBatchMatrix { + DualBatchMatrix { + real_part: self.real_part.get_fixed_submat(start_r, start_c), + dij_part: self.dij_part.clone().map(|dij_val| { + MutTensorDDRC::from_map(&dij_val.view(), |v| v.get_fixed_submat(start_r, start_c)) + }), + } + } + + fn get_col_vec(&self, start_r: usize) -> DualBatchVector { + DualBatchVector { + real_part: self.real_part.get_col_vec(start_r), + dij_part: self + .dij_part + .clone() + .map(|dij_val| MutTensorDDR::from_map(&dij_val.view(), |v| v.get_col_vec(start_r))), + } + } + + fn get_row_vec(&self, c: usize) -> DualBatchVector { + DualBatchVector { + real_part: self.real_part.get_row_vec(c), + dij_part: self + .dij_part + .clone() + .map(|dij_val| MutTensorDDR::from_map(&dij_val.view(), |v| v.get_row_vec(c))), + } + } + + fn from_real_scalar_array2(vals: [[BatchScalarF64; COLS]; ROWS]) -> Self { + DualBatchMatrix { + real_part: BatchMatF64::from_real_scalar_array2(vals), + dij_part: None, + } + } + + fn from_f64_array2(vals: [[f64; COLS]; ROWS]) -> Self { + DualBatchMatrix { + real_part: BatchMatF64::from_f64_array2(vals), + dij_part: None, + } + } + + fn set_col_vec( + &mut self, + c: usize, + v: as IsScalar>::Vector, + ) { + self.real_part.set_col_vec(c, v.real_part); + todo!(); + } + + fn to_dual(self) -> as IsScalar>::DualMatrix { + self + } + + fn select(self, mask: &Mask, other: Self) -> Self { + let maybe_dij = Self::two_dx(self.dij_part, other.dij_part); + + DualBatchMatrix { + real_part: self.real_part.select(mask, other.real_part), + dij_part: match maybe_dij { + Some(dij) => { + let mut r = + MutTensorDDRC::, ROWS, COLS>::from_shape(dij.shape()); + for i in 0..dij.shape()[0] { + for j in 0..dij.shape()[1] { + *r.get_mut([i, j]) = + dij.lhs.get([i, j]).select(mask, dij.rhs.get([i, j])); + } + } + Some(r) + } + _ => None, + }, + } + } +} + +impl Add + for DualBatchMatrix 
+where + LaneCount: SupportedLaneCount, +{ + type Output = DualBatchMatrix; + + fn add(self, rhs: Self) -> Self::Output { + DualBatchMatrix { + real_part: self.real_part + rhs.real_part, + dij_part: Self::binary_mm_dij( + &self.dij_part, + &rhs.dij_part, + |l_dij| *l_dij, + |r_dij| *r_dij, + ), + } + } +} + +impl Sub + for DualBatchMatrix +where + LaneCount: SupportedLaneCount, +{ + type Output = DualBatchMatrix; + + fn sub(self, rhs: Self) -> Self::Output { + DualBatchMatrix { + real_part: self.real_part - rhs.real_part, + dij_part: Self::binary_mm_dij( + &self.dij_part, + &rhs.dij_part, + |l_dij| *l_dij, + |r_dij| -r_dij, + ), + } + } +} + +impl Neg + for DualBatchMatrix +where + LaneCount: SupportedLaneCount, +{ + type Output = DualBatchMatrix; + + fn neg(self) -> Self::Output { + DualBatchMatrix { + real_part: -self.real_part, + dij_part: self + .dij_part + .clone() + .map(|dij_val| MutTensorDDRC::from_map(&dij_val.view(), |v| -v)), + } + } +} + +impl Zero + for DualBatchMatrix +where + LaneCount: SupportedLaneCount, +{ + fn zero() -> Self { + Self::from_real_matrix(BatchMatF64::zeros()) + } + + fn is_zero(&self) -> bool { + self.real_part.is_zero() + } +} + +impl Mul> + for DualBatchMatrix +where + LaneCount: SupportedLaneCount, +{ + type Output = DualBatchVector; + + fn mul(self, rhs: DualBatchVector) -> Self::Output { + Self::Output { + real_part: self.real_part * rhs.real_part, + dij_part: Self::binary_mv_dij( + &self.dij_part, + &rhs.dij_part, + |l_dij| l_dij * rhs.real_part, + |r_dij| self.real_part * r_dij, + ), + } + } +} + +impl Debug + for DualBatchMatrix +where + LaneCount: SupportedLaneCount, +{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if self.dij_part.is_some() { + f.debug_struct("DualScalarLike") + .field("val", &self.real_part) + .field("dij_val", &self.dij_part.as_ref().unwrap().elem_view()) + .finish() + } else { + f.debug_struct("DualScalarLike") + .field("val", &self.real_part) + .finish() + } + } +} diff --git a/crates/sophus_core/src/calculus/dual/dual_batch_scalar.rs b/crates/sophus_core/src/calculus/dual/dual_batch_scalar.rs new file mode 100644 index 0000000..b7f14c0 --- /dev/null +++ b/crates/sophus_core/src/calculus/dual/dual_batch_scalar.rs @@ -0,0 +1,739 @@ +use super::dual_scalar::DualScalar; +pub use crate::calculus::dual::dual_batch_matrix::DualBatchMatrix; +pub use crate::calculus::dual::dual_batch_vector::DualBatchVector; +use crate::linalg::scalar::NumberCategory; +use crate::linalg::BatchMatF64; +use crate::linalg::BatchScalarF64; +use crate::linalg::BatchVecF64; +use crate::prelude::*; +use crate::tensor::mut_tensor::InnerScalarToVec; +use crate::tensor::mut_tensor::MutTensorDD; +use approx::assert_abs_diff_eq; +use approx::AbsDiffEq; +use approx::RelativeEq; +use num_traits::One; +use num_traits::Zero; +use std::fmt::Debug; +use std::ops::Add; +use std::ops::AddAssign; +use std::ops::Div; +use std::ops::Mul; +use std::ops::Neg; +use std::ops::Sub; +use std::ops::SubAssign; +use std::simd::LaneCount; +use std::simd::Mask; +use std::simd::SupportedLaneCount; + +/// Dual number - a real number and an infinitesimal number (batch version) +#[derive(Clone)] +pub struct DualBatchScalar +where + BatchScalarF64: IsCoreScalar, + LaneCount: SupportedLaneCount, +{ + /// real part + pub real_part: BatchScalarF64, + + /// infinitesimal part - represents derivative + pub dij_part: Option>>, +} + +impl IsDual for DualBatchScalar +where + BatchScalarF64: IsCoreScalar, + LaneCount: SupportedLaneCount, +{ +} + +impl IsDualScalar for 
DualBatchScalar +where + BatchScalarF64: IsCoreScalar, + LaneCount: SupportedLaneCount, +{ + fn new_with_dij(val: BatchScalarF64) -> Self { + let dij_val = MutTensorDD::from_shape_and_val([1, 1], BatchScalarF64::::ones()); + Self { + real_part: val, + dij_part: Some(dij_val), + } + } + + fn dij_val(self) -> Option>> { + self.dij_part + } + + fn vector_with_dij(val: Self::RealVector) -> Self::Vector { + DualBatchVector::::new_with_dij(val) + } + + fn matrix_with_dij( + val: Self::RealMatrix, + ) -> Self::Matrix { + DualBatchMatrix::::new_with_dij(val) + } +} + +impl IsCoreScalar for DualBatchScalar +where + BatchScalarF64: IsCoreScalar, + LaneCount: SupportedLaneCount, +{ + fn number_category() -> NumberCategory { + NumberCategory::Real + } +} + +impl AsRef> for DualBatchScalar +where + BatchScalarF64: IsCoreScalar, + LaneCount: SupportedLaneCount, +{ + fn as_ref(&self) -> &DualBatchScalar + where + BatchScalarF64: IsCoreScalar, + LaneCount: SupportedLaneCount, + { + self + } +} + +impl One for DualBatchScalar +where + BatchScalarF64: IsCoreScalar, + LaneCount: SupportedLaneCount, +{ + fn one() -> Self { + >::from_f64(1.0) + } +} + +impl Zero for DualBatchScalar +where + BatchScalarF64: IsCoreScalar, + LaneCount: SupportedLaneCount, +{ + fn zero() -> Self { + >::from_f64(0.0) + } + + fn is_zero(&self) -> bool { + self.real_part == >::from_f64(0.0).real_part() + } +} + +impl DualBatchScalar +where + BatchScalarF64: IsCoreScalar, + LaneCount: SupportedLaneCount, +{ + fn binary_dij< + F: FnMut(&BatchScalarF64) -> BatchScalarF64, + G: FnMut(&BatchScalarF64) -> BatchScalarF64, + >( + lhs_dx: &Option>>, + rhs_dx: &Option>>, + mut left_op: F, + mut right_op: G, + ) -> Option>> { + match (lhs_dx, rhs_dx) { + (None, None) => None, + (None, Some(rhs_dij)) => { + let out_dij = + MutTensorDD::from_map(&rhs_dij.view(), |r_dij: &BatchScalarF64| { + right_op(r_dij) + }); + Some(out_dij) + } + (Some(lhs_dij), None) => { + let out_dij = + MutTensorDD::from_map(&lhs_dij.view(), |l_dij: &BatchScalarF64| { + left_op(l_dij) + }); + Some(out_dij) + } + (Some(lhs_dij), Some(rhs_dij)) => { + let dyn_mat = MutTensorDD::from_map2( + &lhs_dij.view(), + &rhs_dij.view(), + |l_dij: &BatchScalarF64, r_dij: &BatchScalarF64| { + left_op(l_dij) + right_op(r_dij) + }, + ); + Some(dyn_mat) + } + } + } +} + +impl Neg for DualBatchScalar +where + BatchScalarF64: IsCoreScalar, + LaneCount: SupportedLaneCount, +{ + type Output = DualBatchScalar; + + fn neg(self) -> Self { + Self { + real_part: -self.real_part, + dij_part: match self.dij_part.clone() { + Some(dij_val) => { + let dyn_mat = + MutTensorDD::from_map(&dij_val.view(), |v: &BatchScalarF64| -*v); + + Some(dyn_mat) + } + None => None, + }, + } + } +} + +impl PartialEq for DualBatchScalar +where + BatchScalarF64: IsCoreScalar, + LaneCount: SupportedLaneCount, +{ + fn eq(&self, other: &Self) -> bool { + self.real_part == other.real_part + } +} + +impl AbsDiffEq for DualBatchScalar +where + BatchScalarF64: IsCoreScalar, + LaneCount: SupportedLaneCount, +{ + type Epsilon = f64; + + fn default_epsilon() -> Self::Epsilon { + f64::default_epsilon() + } + + fn abs_diff_eq(&self, other: &Self, epsilon: Self::Epsilon) -> bool { + for i in 0..BATCH { + if !self.real_part.extract_single(i).abs_diff_eq( + &other.real_part.extract_single(i), + epsilon.extract_single(i), + ) { + return false; + } + } + true + } +} + +impl RelativeEq for DualBatchScalar +where + BatchScalarF64: IsCoreScalar, + LaneCount: SupportedLaneCount, +{ + fn default_max_relative() -> Self::Epsilon { + 
f64::default_max_relative() + } + + fn relative_eq( + &self, + other: &Self, + epsilon: Self::Epsilon, + max_relative: Self::Epsilon, + ) -> bool { + for i in 0..BATCH { + if !self.real_part.extract_single(i).relative_eq( + &other.real_part.extract_single(i), + epsilon.extract_single(i), + max_relative.extract_single(i), + ) { + return false; + } + } + true + } +} + +impl From> for DualBatchScalar +where + BatchScalarF64: IsCoreScalar, + LaneCount: SupportedLaneCount, +{ + fn from(value: BatchScalarF64) -> Self { + Self::from_real_scalar(value) + } +} + +impl Debug for DualBatchScalar +where + BatchScalarF64: IsCoreScalar, + LaneCount: SupportedLaneCount, +{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if self.dij_part.is_some() { + f.debug_struct("DualScalar") + .field("val", &self.real_part) + .field("dij_val", &self.dij_part.as_ref().unwrap().elem_view()) + .finish() + } else { + f.debug_struct("DualScalar") + .field("val", &self.real_part) + .finish() + } + } +} + +impl IsScalar for DualBatchScalar +where + BatchScalarF64: IsCoreScalar, + LaneCount: SupportedLaneCount, +{ + type Scalar = DualBatchScalar; + type RealScalar = BatchScalarF64; + type SingleScalar = DualScalar; + type DualVector = DualBatchVector; + type DualMatrix = DualBatchMatrix; + type RealVector = BatchVecF64; + type RealMatrix = BatchMatF64; + type Vector = DualBatchVector; + type Matrix = DualBatchMatrix; + + type Mask = Mask; + + fn from_real_scalar(val: BatchScalarF64) -> Self { + Self { + real_part: val, + dij_part: None, + } + } + + fn scalar_examples() -> Vec { + [1.0, 2.0, 3.0].iter().map(|&v| Self::from_f64(v)).collect() + } + + fn from_real_array(arr: [f64; BATCH]) -> Self { + Self::from_real_scalar(BatchScalarF64::::from_real_array(arr)) + } + + fn to_real_array(&self) -> [f64; BATCH] { + self.real_part.to_real_array() + } + + fn extract_single(&self, i: usize) -> Self::SingleScalar { + Self::SingleScalar { + real_part: self.real_part.extract_single(i), + dij_part: match self.dij_part.clone() { + Some(dij_val) => { + let dyn_mat = + MutTensorDD::from_map(&dij_val.view(), |dij: &BatchScalarF64| { + dij.extract_single(i) + }); + Some(dyn_mat) + } + None => None, + }, + } + } + + fn cos(self) -> DualBatchScalar + where + BatchScalarF64: IsCoreScalar, + LaneCount: SupportedLaneCount, + { + Self { + real_part: self.real_part.cos(), + dij_part: match self.dij_part.clone() { + Some(dij_val) => { + let dyn_mat = + MutTensorDD::from_map(&dij_val.view(), |dij: &BatchScalarF64| { + -*dij * self.real_part.sin() + }); + Some(dyn_mat) + } + None => None, + }, + } + } + + fn signum(&self) -> Self { + Self { + real_part: self.real_part.signum(), + dij_part: None, + } + } + + fn sin(self) -> DualBatchScalar + where + BatchScalarF64: IsCoreScalar, + LaneCount: SupportedLaneCount, + { + Self { + real_part: self.real_part.sin(), + dij_part: match self.dij_part.clone() { + Some(dij_val) => { + let dyn_mat = + MutTensorDD::from_map(&dij_val.view(), |dij: &BatchScalarF64| { + *dij * self.real_part.cos() + }); + Some(dyn_mat) + } + None => None, + }, + } + } + + fn abs(self) -> Self { + Self { + real_part: self.real_part.abs(), + dij_part: match self.dij_part.clone() { + Some(dij_val) => { + let dyn_mat = + MutTensorDD::from_map(&dij_val.view(), |dij: &BatchScalarF64| { + *dij * self.real_part.signum() + }); + Some(dyn_mat) + } + None => None, + }, + } + } + + fn atan2(self, rhs: Self) -> Self { + let inv_sq_nrm: BatchScalarF64 = BatchScalarF64::ones() + / (self.real_part * self.real_part + 
rhs.real_part * rhs.real_part); + Self { + real_part: self.real_part.atan2(rhs.real_part), + dij_part: Self::binary_dij( + &self.dij_part, + &rhs.dij_part, + |l_dij| inv_sq_nrm * ((*l_dij) * rhs.real_part), + |r_dij| -inv_sq_nrm * (self.real_part * (*r_dij)), + ), + } + } + + fn real_part(&self) -> BatchScalarF64 { + self.real_part + } + + fn sqrt(self) -> Self { + let sqrt = self.real_part.sqrt(); + Self { + real_part: sqrt, + dij_part: match self.dij_part { + Some(dij) => { + let out_dij = + MutTensorDD::from_map(&dij.view(), |dij: &BatchScalarF64| { + *dij * BatchScalarF64::::from_f64(1.0) + / (BatchScalarF64::::from_f64(2.0) * sqrt) + }); + Some(out_dij) + } + None => None, + }, + } + } + + fn to_vec(self) -> DualBatchVector<1, BATCH> { + DualBatchVector::<1, BATCH> { + real_part: self.real_part.real_part().to_vec(), + dij_part: match self.dij_part { + Some(dij) => { + let tmp = dij.inner_scalar_to_vec(); + Some(tmp) + } + None => None, + }, + } + } + + fn tan(self) -> Self { + Self { + real_part: self.real_part.tan(), + dij_part: match self.dij_part.clone() { + Some(dij_val) => { + let c = self.real_part.cos(); + let sec_squared = BatchScalarF64::::ones() / (c * c); + let dyn_mat = + MutTensorDD::from_map(&dij_val.view(), |dij: &BatchScalarF64| { + *dij * sec_squared + }); + Some(dyn_mat) + } + None => None, + }, + } + } + + fn acos(self) -> Self { + Self { + real_part: self.real_part.acos(), + dij_part: match self.dij_part.clone() { + Some(dij_val) => { + let dval = -BatchScalarF64::::ones() + / (BatchScalarF64::::ones() - self.real_part * self.real_part) + .sqrt(); + let dyn_mat = + MutTensorDD::from_map(&dij_val.view(), |dij: &BatchScalarF64| { + *dij * dval + }); + Some(dyn_mat) + } + None => None, + }, + } + } + + fn asin(self) -> Self { + Self { + real_part: self.real_part.asin(), + dij_part: match self.dij_part.clone() { + Some(dij_val) => { + let dval = BatchScalarF64::::ones() + / (BatchScalarF64::::ones() - self.real_part * self.real_part) + .sqrt(); + let dyn_mat = + MutTensorDD::from_map(&dij_val.view(), |dij: &BatchScalarF64| { + *dij * dval + }); + Some(dyn_mat) + } + None => None, + }, + } + } + + fn atan(self) -> Self { + Self { + real_part: self.real_part.atan(), + dij_part: match self.dij_part.clone() { + Some(dij_val) => { + let dval = BatchScalarF64::::ones() + / (BatchScalarF64::::ones() + self.real_part * self.real_part); + let dyn_mat = + MutTensorDD::from_map(&dij_val.view(), |dij: &BatchScalarF64| { + *dij * dval + }); + Some(dyn_mat) + } + None => None, + }, + } + } + + fn fract(self) -> Self { + Self { + real_part: self.real_part.fract(), + dij_part: match self.dij_part.clone() { + Some(dij_val) => { + let dyn_mat = + MutTensorDD::from_map(&dij_val.view(), |dij: &BatchScalarF64| *dij); + Some(dyn_mat) + } + None => None, + }, + } + } + + fn floor(&self) -> BatchScalarF64 { + self.real_part.floor() + } + + fn from_f64(val: f64) -> Self { + Self::from_real_scalar(BatchScalarF64::::from_f64(val)) + } + + type DualScalar = Self; + + fn ones() -> Self { + Self::from_f64(1.0) + } + + fn zeros() -> Self { + Self::from_f64(0.0) + } + + fn test_suite() { + let examples = Self::scalar_examples(); + for a in &examples { + let sin_a = a.clone().sin(); + let cos_a = a.clone().cos(); + let val = sin_a.clone() * sin_a + cos_a.clone() * cos_a; + let one = Self::ones(); + + for i in 0..BATCH { + assert_abs_diff_eq!(val.extract_single(i), one.extract_single(i)); + } + } + } + + fn less_equal(&self, rhs: &Self) -> Self::Mask { + self.real_part.less_equal(&rhs.real_part) + } + 
+ fn to_dual(self) -> Self::DualScalar { + self + } + + fn select(self, mask: &Self::Mask, other: Self) -> Self { + Self { + real_part: self.real_part.select(mask, other.real_part), + dij_part: match (self.dij_part, other.dij_part) { + (Some(lhs), Some(rhs)) => { + let dyn_mat = MutTensorDD::from_map2( + &lhs.view(), + &rhs.view(), + |l: &BatchScalarF64, r: &BatchScalarF64| l.select(mask, *r), + ); + Some(dyn_mat) + } + _ => None, + }, + } + } + + fn greater_equal(&self, rhs: &Self) -> Self::Mask { + self.real_part.greater_equal(&rhs.real_part) + } +} + +impl AddAssign> for DualBatchScalar +where + BatchScalarF64: IsCoreScalar, + LaneCount: SupportedLaneCount, +{ + fn add_assign(&mut self, rhs: Self) { + *self = self.clone().add(&rhs); + } +} +impl SubAssign> for DualBatchScalar +where + BatchScalarF64: IsCoreScalar, + LaneCount: SupportedLaneCount, +{ + fn sub_assign(&mut self, rhs: Self) { + *self = self.clone().sub(&rhs); + } +} + +impl Add> for DualBatchScalar +where + BatchScalarF64: IsCoreScalar, + LaneCount: SupportedLaneCount, +{ + type Output = DualBatchScalar; + fn add(self, rhs: Self) -> Self::Output { + self.add(&rhs) + } +} + +impl Add<&DualBatchScalar> for DualBatchScalar +where + BatchScalarF64: IsCoreScalar, + LaneCount: SupportedLaneCount, +{ + type Output = DualBatchScalar; + fn add(self, rhs: &Self) -> Self::Output { + let r = self.real_part + rhs.real_part; + + Self { + real_part: r, + dij_part: Self::binary_dij( + &self.dij_part, + &rhs.dij_part, + |l_dij| *l_dij, + |r_dij| *r_dij, + ), + } + } +} + +impl Mul> for DualBatchScalar +where + BatchScalarF64: IsCoreScalar, + LaneCount: SupportedLaneCount, +{ + type Output = DualBatchScalar; + fn mul(self, rhs: Self) -> Self::Output { + self.mul(&rhs) + } +} + +impl Mul<&DualBatchScalar> for DualBatchScalar +where + BatchScalarF64: IsCoreScalar, + LaneCount: SupportedLaneCount, +{ + type Output = DualBatchScalar; + fn mul(self, rhs: &Self) -> Self::Output { + let r = self.real_part * rhs.real_part; + + Self { + real_part: r, + dij_part: Self::binary_dij( + &self.dij_part, + &rhs.dij_part, + |l_dij| (*l_dij) * rhs.real_part, + |r_dij| (*r_dij) * self.real_part, + ), + } + } +} + +impl Div> for DualBatchScalar +where + BatchScalarF64: IsCoreScalar, + LaneCount: SupportedLaneCount, +{ + type Output = DualBatchScalar; + fn div(self, rhs: Self) -> Self::Output { + self.div(&rhs) + } +} + +impl Div<&DualBatchScalar> for DualBatchScalar +where + BatchScalarF64: IsCoreScalar, + LaneCount: SupportedLaneCount, +{ + type Output = DualBatchScalar; + fn div(self, rhs: &Self) -> Self::Output { + let rhs_inv = BatchScalarF64::::ones() / rhs.real_part; + Self { + real_part: self.real_part * rhs_inv, + dij_part: Self::binary_dij( + &self.dij_part, + &rhs.dij_part, + |l_dij| *l_dij * rhs_inv, + |r_dij| -self.real_part * (*r_dij) * rhs_inv * rhs_inv, + ), + } + } +} + +impl Sub> for DualBatchScalar +where + BatchScalarF64: IsCoreScalar, + LaneCount: SupportedLaneCount, +{ + type Output = DualBatchScalar; + fn sub(self, rhs: Self) -> Self::Output { + self.sub(&rhs) + } +} + +impl Sub<&DualBatchScalar> for DualBatchScalar +where + BatchScalarF64: IsCoreScalar, + LaneCount: SupportedLaneCount, +{ + type Output = DualBatchScalar; + fn sub(self, rhs: &Self) -> Self::Output { + Self { + real_part: self.real_part - rhs.real_part, + dij_part: Self::binary_dij( + &self.dij_part, + &rhs.dij_part, + |l_dij| *l_dij, + |r_dij| -(*r_dij), + ), + } + } +} diff --git a/crates/sophus_core/src/calculus/dual/dual_batch_vector.rs 
b/crates/sophus_core/src/calculus/dual/dual_batch_vector.rs new file mode 100644 index 0000000..d3c0c7c --- /dev/null +++ b/crates/sophus_core/src/calculus/dual/dual_batch_vector.rs @@ -0,0 +1,526 @@ +pub use crate::calculus::dual::dual_batch_scalar::DualBatchScalar; +use crate::calculus::dual::dual_vector::DijPair; +use crate::calculus::dual::DualBatchMatrix; +use crate::linalg::BatchScalarF64; +use crate::linalg::BatchVecF64; +use crate::prelude::*; +use crate::tensor::mut_tensor::InnerVecToMat; +use crate::tensor::mut_tensor::MutTensorDD; +use crate::tensor::mut_tensor::MutTensorDDR; +use approx::AbsDiffEq; +use approx::RelativeEq; +use std::fmt::Debug; +use std::ops::Add; +use std::ops::Neg; +use std::ops::Sub; +use std::simd::LaneCount; +use std::simd::Mask; +use std::simd::SupportedLaneCount; + +/// Dual vector (batch version) +#[derive(Clone, Debug)] +pub struct DualBatchVector +where + BatchScalarF64: IsCoreScalar, + LaneCount: SupportedLaneCount, +{ + /// real part + pub real_part: BatchVecF64, + /// infinitesimal part - represents derivative + pub dij_part: Option, ROWS>>, +} + +impl num_traits::Zero for DualBatchVector +where + LaneCount: SupportedLaneCount, +{ + fn zero() -> Self { + DualBatchVector { + real_part: BatchVecF64::::zeros(), + dij_part: None, + } + } + + fn is_zero(&self) -> bool { + self.real_part == BatchVecF64::::zeros() + } +} + +impl IsDual for DualBatchVector where + LaneCount: SupportedLaneCount +{ +} + +impl PartialEq for DualBatchVector +where + LaneCount: SupportedLaneCount, +{ + fn eq(&self, other: &Self) -> bool { + self.real_part == other.real_part && self.dij_part == other.dij_part + } +} + +impl AbsDiffEq for DualBatchVector +where + LaneCount: SupportedLaneCount, +{ + type Epsilon = f64; + + fn default_epsilon() -> Self::Epsilon { + f64::default_epsilon() + } + + fn abs_diff_eq(&self, other: &Self, epsilon: Self::Epsilon) -> bool { + self.real_part.abs_diff_eq(&other.real_part, epsilon) + } +} + +impl RelativeEq for DualBatchVector +where + LaneCount: SupportedLaneCount, +{ + fn default_max_relative() -> Self::Epsilon { + f64::default_max_relative() + } + + fn relative_eq( + &self, + other: &Self, + epsilon: Self::Epsilon, + max_relative: Self::Epsilon, + ) -> bool { + self.real_part + .relative_eq(&other.real_part, epsilon, max_relative) + } +} + +impl IsDualVector, ROWS, BATCH> + for DualBatchVector +where + LaneCount: SupportedLaneCount, +{ + fn new_with_dij(val: BatchVecF64) -> Self { + let mut dij_val = MutTensorDDR::, ROWS>::from_shape([ROWS, 1]); + for i in 0..ROWS { + dij_val.mut_view().get_mut([i, 0])[(i, 0)] = BatchScalarF64::::ones(); + } + + Self { + real_part: val, + dij_part: Some(dij_val), + } + } + + fn dij_val(self) -> Option, ROWS>> { + self.dij_part + } +} + +impl DualBatchVector +where + LaneCount: SupportedLaneCount, +{ + fn binary_dij< + const R0: usize, + const R1: usize, + F: FnMut(&BatchVecF64) -> BatchVecF64, + G: FnMut(&BatchVecF64) -> BatchVecF64, + >( + lhs_dx: &Option, R0>>, + rhs_dx: &Option, R1>>, + mut left_op: F, + mut right_op: G, + ) -> Option, ROWS>> { + match (lhs_dx, rhs_dx) { + (None, None) => None, + (None, Some(rhs_dij)) => { + let out_dij = MutTensorDDR::from_map(&rhs_dij.view(), |r_dij| right_op(r_dij)); + Some(out_dij) + } + (Some(lhs_dij), None) => { + let out_dij = MutTensorDDR::from_map(&lhs_dij.view(), |l_dij| left_op(l_dij)); + Some(out_dij) + } + (Some(lhs_dij), Some(rhs_dij)) => { + let dyn_mat = + MutTensorDDR::from_map2(&lhs_dij.view(), &rhs_dij.view(), |l_dij, r_dij| { + left_op(l_dij) + 
right_op(r_dij) + }); + Some(dyn_mat) + } + } + } + + fn binary_vs_dij< + const R0: usize, + F: FnMut(&BatchVecF64) -> BatchVecF64, + G: FnMut(&BatchScalarF64) -> BatchVecF64, + >( + lhs_dx: &Option, R0>>, + rhs_dx: &Option>>, + mut left_op: F, + mut right_op: G, + ) -> Option, ROWS>> { + match (lhs_dx, rhs_dx) { + (None, None) => None, + (None, Some(rhs_dij)) => { + let out_dij = MutTensorDDR::from_map(&rhs_dij.view(), |r_dij| right_op(r_dij)); + Some(out_dij) + } + (Some(lhs_dij), None) => { + let out_dij = MutTensorDDR::from_map(&lhs_dij.view(), |l_dij| left_op(l_dij)); + Some(out_dij) + } + (Some(lhs_dij), Some(rhs_dij)) => { + let dyn_mat = + MutTensorDDR::from_map2(&lhs_dij.view(), &rhs_dij.view(), |l_dij, r_dij| { + left_op(l_dij) + right_op(r_dij) + }); + Some(dyn_mat) + } + } + } + + fn two_dx( + mut lhs_dx: Option, R0>>, + mut rhs_dx: Option, R1>>, + ) -> Option, R0, R1>> { + if lhs_dx.is_none() && rhs_dx.is_none() { + return None; + } + + if lhs_dx.is_some() && rhs_dx.is_some() { + assert_eq!( + lhs_dx.clone().unwrap().dims(), + rhs_dx.clone().unwrap().dims() + ); + } + + if lhs_dx.is_none() { + lhs_dx = Some(MutTensorDDR::from_shape(rhs_dx.clone().unwrap().dims())) + } else if rhs_dx.is_none() { + rhs_dx = Some(MutTensorDDR::from_shape(lhs_dx.clone().unwrap().dims())) + } + + Some(DijPair { + lhs: lhs_dx.unwrap(), + rhs: rhs_dx.unwrap(), + }) + } +} + +impl Neg for DualBatchVector +where + LaneCount: SupportedLaneCount, +{ + type Output = DualBatchVector; + + fn neg(self) -> Self::Output { + DualBatchVector { + real_part: -self.real_part, + dij_part: self + .dij_part + .clone() + .map(|dij_val| MutTensorDDR::from_map(&dij_val.view(), |v| -v)), + } + } +} + +impl Sub for DualBatchVector +where + LaneCount: SupportedLaneCount, +{ + type Output = DualBatchVector; + + fn sub(self, rhs: Self) -> Self::Output { + DualBatchVector { + real_part: self.real_part - rhs.real_part, + dij_part: Self::binary_dij( + &self.dij_part, + &rhs.dij_part, + |l_dij| *l_dij, + |r_dij| -r_dij, + ), + } + } +} + +impl Add for DualBatchVector +where + LaneCount: SupportedLaneCount, +{ + type Output = DualBatchVector; + + fn add(self, rhs: Self) -> Self::Output { + DualBatchVector { + real_part: self.real_part + rhs.real_part, + dij_part: Self::binary_dij( + &self.dij_part, + &rhs.dij_part, + |l_dij| *l_dij, + |r_dij| *r_dij, + ), + } + } +} + +impl IsVector, ROWS, BATCH> + for DualBatchVector +where + BatchScalarF64: IsCoreScalar, + LaneCount: SupportedLaneCount, +{ + fn from_f64(val: f64) -> Self { + DualBatchVector { + real_part: + as IsVector, ROWS, BATCH>>::from_f64( + val, + ), + dij_part: None, + } + } + + fn outer( + self, + rhs: DualBatchVector, + ) -> DualBatchMatrix { + let mut result = DualBatchMatrix::zeros(); + for i in 0..ROWS { + for j in 0..R2 { + result.set_elem([i, j], self.get_elem(i) * rhs.get_elem(j)); + } + } + result + } + + fn norm(&self) -> DualBatchScalar { + self.clone().dot(self.clone()).sqrt() + } + + fn squared_norm(&self) -> DualBatchScalar { + self.clone().dot(self.clone()) + } + + fn get_elem(&self, idx: usize) -> DualBatchScalar { + DualBatchScalar { + real_part: self.real_part[idx], + dij_part: self + .dij_part + .clone() + .map(|dij_val| MutTensorDD::from_map(&dij_val.view(), |v| v[idx])), + } + } + + fn from_array(duals: [DualBatchScalar; ROWS]) -> Self { + let mut shape = None; + let mut val_v = BatchVecF64::::zeros(); + for i in 0..duals.len() { + let d = duals.clone()[i].clone(); + + val_v[i] = d.real_part; + if d.dij_part.is_some() { + shape = 
Some(d.dij_part.clone().unwrap().dims()); + } + } + + if shape.is_none() { + return DualBatchVector { + real_part: val_v, + dij_part: None, + }; + } + let shape = shape.unwrap(); + + let mut r = MutTensorDDR::, ROWS>::from_shape(shape); + + for i in 0..duals.len() { + let d = duals.clone()[i].clone(); + if d.dij_part.is_some() { + for d0 in 0..shape[0] { + for d1 in 0..shape[1] { + r.mut_view().get_mut([d0, d1])[(i, 0)] = + d.dij_part.clone().unwrap().get([d0, d1]); + } + } + } + } + DualBatchVector { + real_part: val_v, + dij_part: Some(r), + } + } + + fn from_real_array(vals: [BatchScalarF64; ROWS]) -> Self { + DualBatchVector { + real_part: BatchVecF64::from_real_array(vals), + dij_part: None, + } + } + + fn from_real_vector(val: BatchVecF64) -> Self { + Self { + real_part: val, + dij_part: None, + } + } + + fn real_vector(&self) -> &BatchVecF64 { + &self.real_part + } + + fn to_mat(self) -> DualBatchMatrix { + DualBatchMatrix { + real_part: self.real_part, + dij_part: self.dij_part.map(|dij| dij.inner_vec_to_mat()), + } + } + + fn block_vec2( + top_row: DualBatchVector, + bot_row: DualBatchVector, + ) -> Self { + assert_eq!(R0 + R1, ROWS); + + let maybe_dij = Self::two_dx(top_row.dij_part, bot_row.dij_part); + Self { + real_part: BatchVecF64::::block_vec2(top_row.real_part, bot_row.real_part), + dij_part: match maybe_dij { + Some(dij_val) => { + let mut r = + MutTensorDDR::, ROWS>::from_shape(dij_val.shape()); + for d0 in 0..dij_val.shape()[0] { + for d1 in 0..dij_val.shape()[1] { + *r.mut_view().get_mut([d0, d1]) = + BatchVecF64::::block_vec2( + dij_val.lhs.get([d0, d1]), + dij_val.rhs.get([d0, d1]), + ); + } + } + Some(r) + } + None => None, + }, + } + } + + fn scaled(&self, s: DualBatchScalar) -> Self { + DualBatchVector { + real_part: self.real_part * s.real_part, + dij_part: Self::binary_vs_dij( + &self.dij_part, + &s.dij_part, + |l_dij| l_dij * s.real_part, + |r_dij| self.real_part * *r_dij, + ), + } + } + + fn dot(self, rhs: Self) -> DualBatchScalar { + let mut sum = DualBatchScalar::from_f64(0.0); + + for i in 0..ROWS { + sum += self.get_elem(i) * rhs.get_elem(i); + } + + sum + } + + fn normalized(&self) -> Self { + self.clone() + .scaled(DualBatchScalar::::from_f64(1.0) / self.norm()) + } + + fn from_f64_array(vals: [f64; ROWS]) -> Self { + DualBatchVector { + real_part: BatchVecF64::from_f64_array(vals), + dij_part: None, + } + } + + fn from_scalar_array(vals: [DualBatchScalar; ROWS]) -> Self { + let mut shape = None; + let mut val_v = BatchVecF64::::zeros(); + for i in 0..vals.len() { + let d = vals.clone()[i].clone(); + + val_v[i] = d.real_part; + if d.dij_part.is_some() { + shape = Some(d.dij_part.clone().unwrap().dims()); + } + } + + if shape.is_none() { + return DualBatchVector { + real_part: val_v, + dij_part: None, + }; + } + let shape = shape.unwrap(); + + let mut r = MutTensorDDR::, ROWS>::from_shape(shape); + + for i in 0..vals.len() { + let d = vals.clone()[i].clone(); + if d.dij_part.is_some() { + for d0 in 0..shape[0] { + for d1 in 0..shape[1] { + r.mut_view().get_mut([d0, d1])[(i, 0)] = + d.dij_part.clone().unwrap().get([d0, d1]); + } + } + } + } + DualBatchVector { + real_part: val_v, + dij_part: Some(r), + } + } + + fn set_elem(&mut self, idx: usize, v: DualBatchScalar) { + self.real_part[idx] = v.real_part; + if self.dij_part.is_some() { + let dij = &mut self.dij_part.as_mut().unwrap(); + for i in 0..dij.dims()[0] { + for j in 0..dij.dims()[1] { + dij.mut_view().get_mut([i, j])[idx] = v.dij_part.clone().unwrap().get([i, j]); + } + } + } + } + + fn 
to_dual(self) -> as IsScalar>::DualVector { + self + } + + fn select(self, mask: &Mask, other: Self) -> Self { + let maybe_dij = Self::two_dx(self.dij_part, other.dij_part); + + Self { + real_part: IsVector::select(self.real_part, mask, other.real_part), + dij_part: match maybe_dij { + Some(dij) => { + let mut r = + MutTensorDDR::, ROWS>::from_shape(dij.shape()); + for i in 0..dij.shape()[0] { + for j in 0..dij.shape()[1] { + *r.get_mut([i, j]) = + IsVector::select(dij.lhs.get([i, j]), mask, dij.rhs.get([i, j])); + } + } + Some(r) + } + _ => None, + }, + } + } + + fn get_fixed_subvec(&self, start_r: usize) -> DualBatchVector { + DualBatchVector { + real_part: self.real_part.fixed_rows::(start_r).into(), + dij_part: self.dij_part.clone().map(|dij_val| { + MutTensorDDR::from_map(&dij_val.view(), |v| v.fixed_rows::(start_r).into()) + }), + } + } +} diff --git a/crates/sophus_core/src/calculus/dual/dual_matrix.rs b/crates/sophus_core/src/calculus/dual/dual_matrix.rs index a4dbcbb..b323132 100644 --- a/crates/sophus_core/src/calculus/dual/dual_matrix.rs +++ b/crates/sophus_core/src/calculus/dual/dual_matrix.rs @@ -1,10 +1,5 @@ -use crate::calculus::dual::DualBatchScalar; -use crate::calculus::dual::DualBatchVector; use crate::calculus::dual::DualScalar; use crate::calculus::dual::DualVector; -use crate::linalg::BatchMatF64; -use crate::linalg::BatchScalarF64; -use crate::linalg::BatchVecF64; use crate::linalg::MatF64; use crate::linalg::VecF64; use crate::prelude::*; @@ -19,9 +14,6 @@ use std::ops::Add; use std::ops::Mul; use std::ops::Neg; use std::ops::Sub; -use std::simd::LaneCount; -use std::simd::Mask; -use std::simd::SupportedLaneCount; /// DualScalarLike matrix #[derive(Clone)] @@ -32,17 +24,19 @@ pub struct DualMatrix { pub dij_part: Option>, } -/// DualScalarLike matrix -#[derive(Clone)] -pub struct DualBatchMatrix -where - BatchScalarF64: IsCoreScalar, - LaneCount: SupportedLaneCount, -{ - /// real part - pub real_part: BatchMatF64, - /// infinitesimal part - represents derivative - pub dij_part: Option, ROWS, COLS>>, +impl Debug for DualMatrix { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if self.dij_part.is_some() { + f.debug_struct("DualScalarLike") + .field("val", &self.real_part) + .field("dij_val", &self.dij_part.as_ref().unwrap().elem_view()) + .finish() + } else { + f.debug_struct("DualScalarLike") + .field("val", &self.real_part) + .finish() + } + } } impl IsSingleMatrix @@ -105,8 +99,8 @@ pub struct DijPairM< const ROWS2: usize, const COLS2: usize, > { - lhs: MutTensorDDRC, - rhs: MutTensorDDRC, + pub(crate) lhs: MutTensorDDRC, + pub(crate) rhs: MutTensorDDRC, } /// Pair of dual matrices @@ -125,7 +119,7 @@ impl< const COLS2: usize, > DijPairM { - fn shape(&self) -> [usize; 2] { + pub(crate) fn shape(&self) -> [usize; 2] { self.lhs.dims() } } @@ -656,684 +650,14 @@ impl Mul> for DualMatrix< } } -impl Debug for DualMatrix { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - if self.dij_part.is_some() { - f.debug_struct("DualScalarLike") - .field("val", &self.real_part) - .field("dij_val", &self.dij_part.as_ref().unwrap().elem_view()) - .finish() - } else { - f.debug_struct("DualScalarLike") - .field("val", &self.real_part) - .finish() - } - } -} - -impl IsDual - for DualBatchMatrix -where - LaneCount: SupportedLaneCount, -{ -} - -impl - IsDualMatrix, ROWS, COLS, BATCH> for DualBatchMatrix -where - LaneCount: SupportedLaneCount, -{ - /// Create a new dual number - fn new_with_dij(val: BatchMatF64) -> Self { - let mut dij_val = 
-            MutTensorDDRC::<BatchScalarF64<BATCH>, ROWS, COLS>::from_shape([ROWS, COLS]);
-        for i in 0..ROWS {
-            for j in 0..COLS {
-                dij_val.mut_view().get_mut([i, j])[(i, j)] = BatchScalarF64::<BATCH>::from_f64(1.0);
-            }
-        }
-
-        Self {
-            real_part: val,
-            dij_part: Some(dij_val),
-        }
-    }
-
-    /// Get the derivative
-    fn dij_val(self) -> Option<MutTensorDDRC<BatchScalarF64<BATCH>, ROWS, COLS>> {
-        self.dij_part
-    }
-}
-
-impl<const ROWS: usize, const COLS: usize, const BATCH: usize> DualBatchMatrix<ROWS, COLS, BATCH>
-where
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    fn binary_mm_dij<
-        const R0: usize,
-        const R1: usize,
-        const C0: usize,
-        const C1: usize,
-        F: FnMut(&BatchMatF64<R0, C0, BATCH>) -> BatchMatF64<ROWS, COLS, BATCH>,
-        G: FnMut(&BatchMatF64<R1, C1, BATCH>) -> BatchMatF64<ROWS, COLS, BATCH>,
-    >(
-        lhs_dx: &Option<MutTensorDDRC<BatchScalarF64<BATCH>, R0, C0>>,
-        rhs_dx: &Option<MutTensorDDRC<BatchScalarF64<BATCH>, R1, C1>>,
-        mut left_op: F,
-        mut right_op: G,
-    ) -> Option<MutTensorDDRC<BatchScalarF64<BATCH>, ROWS, COLS>> {
-        match (lhs_dx, rhs_dx) {
-            (None, None) => None,
-            (None, Some(rhs_dij)) => {
-                let out_dij = MutTensorDDRC::from_map(&rhs_dij.view(), |r_dij| right_op(r_dij));
-                Some(out_dij)
-            }
-            (Some(lhs_dij), None) => {
-                let out_dij = MutTensorDDRC::from_map(&lhs_dij.view(), |l_dij| left_op(l_dij));
-                Some(out_dij)
-            }
-            (Some(lhs_dij), Some(rhs_dij)) => {
-                let dyn_mat =
-                    MutTensorDDRC::from_map2(&lhs_dij.view(), &rhs_dij.view(), |l_dij, r_dij| {
-                        left_op(l_dij) + right_op(r_dij)
-                    });
-                Some(dyn_mat)
-            }
-        }
-    }
-
-    fn binary_mv_dij<
-        const R0: usize,
-        const R1: usize,
-        const C0: usize,
-        F: FnMut(&BatchMatF64<R0, C0, BATCH>) -> BatchVecF64<ROWS, BATCH>,
-        G: FnMut(&BatchVecF64<R1, BATCH>) -> BatchVecF64<ROWS, BATCH>,
-    >(
-        lhs_dx: &Option<MutTensorDDRC<BatchScalarF64<BATCH>, R0, C0>>,
-        rhs_dx: &Option<MutTensorDDR<BatchScalarF64<BATCH>, R1>>,
-        mut left_op: F,
-        mut right_op: G,
-    ) -> Option<MutTensorDDR<BatchScalarF64<BATCH>, ROWS>> {
-        match (lhs_dx, rhs_dx) {
-            (None, None) => None,
-            (None, Some(rhs_dij)) => {
-                let out_dij = MutTensorDDR::from_map(&rhs_dij.view(), |r_dij| right_op(r_dij));
-                Some(out_dij)
-            }
-            (Some(lhs_dij), None) => {
-                let out_dij = MutTensorDDR::from_map(&lhs_dij.view(), |l_dij| left_op(l_dij));
-                Some(out_dij)
-            }
-            (Some(lhs_dij), Some(rhs_dij)) => {
-                let dyn_mat =
-                    MutTensorDDR::from_map2(&lhs_dij.view(), &rhs_dij.view(), |l_dij, r_dij| {
-                        left_op(l_dij) + right_op(r_dij)
-                    });
-                Some(dyn_mat)
-            }
-        }
-    }
-
-    fn binary_ms_dij<
-        const R0: usize,
-        const C0: usize,
-        F: FnMut(&BatchMatF64<R0, C0, BATCH>) -> BatchMatF64<ROWS, COLS, BATCH>,
-        G: FnMut(&BatchScalarF64<BATCH>) -> BatchMatF64<ROWS, COLS, BATCH>,
-    >(
-        lhs_dx: &Option<MutTensorDDRC<BatchScalarF64<BATCH>, R0, C0>>,
-        rhs_dx: &Option<MutTensorDD<BatchScalarF64<BATCH>>>,
-        mut left_op: F,
-        mut right_op: G,
-    ) -> Option<MutTensorDDRC<BatchScalarF64<BATCH>, ROWS, COLS>> {
-        match (lhs_dx, rhs_dx) {
-            (None, None) => None,
-            (None, Some(rhs_dij)) => {
-                let out_dij = MutTensorDDRC::from_map(&rhs_dij.view(), |r_dij| right_op(r_dij));
-                Some(out_dij)
-            }
-            (Some(lhs_dij), None) => {
-                let out_dij = MutTensorDDRC::from_map(&lhs_dij.view(), |l_dij| left_op(l_dij));
-                Some(out_dij)
-            }
-            (Some(lhs_dij), Some(rhs_dij)) => {
-                let dyn_mat =
-                    MutTensorDDRC::from_map2(&lhs_dij.view(), &rhs_dij.view(), |l_dij, r_dij| {
-                        left_op(l_dij) + right_op(r_dij)
-                    });
-                Some(dyn_mat)
-            }
-        }
-    }
-
-    /// derivatives
-    pub fn two_dx<const R1: usize, const C1: usize, const R2: usize, const C2: usize>(
-        mut lhs_dx: Option<MutTensorDDRC<BatchScalarF64<BATCH>, R1, C1>>,
-        mut rhs_dx: Option<MutTensorDDRC<BatchScalarF64<BATCH>, R2, C2>>,
-    ) -> Option<DijPairM<BatchScalarF64<BATCH>, R1, C1, R2, C2>> {
-        if lhs_dx.is_none() && rhs_dx.is_none() {
-            return None;
-        }
-
-        if lhs_dx.is_some() && rhs_dx.is_some() {
-            assert_eq!(
-                lhs_dx.clone().unwrap().dims(),
-                rhs_dx.clone().unwrap().dims()
-            );
-        }
-
-        if lhs_dx.is_none() {
-            lhs_dx = Some(MutTensorDDRC::<BatchScalarF64<BATCH>, R1, C1>::from_shape(
-                rhs_dx.clone().unwrap().dims(),
-            ))
-        } else if rhs_dx.is_none() {
-            rhs_dx = Some(MutTensorDDRC::<BatchScalarF64<BATCH>, R2, C2>::from_shape(
-                lhs_dx.clone().unwrap().dims(),
-            ))
-        }
-
-        Some(DijPairM {
-            lhs: lhs_dx.unwrap(),
-            rhs: rhs_dx.unwrap(),
-        })
-    }
-
-    /// derivatives
-    pub fn two_dx_from_vec(
-        mut lhs_dx: Option<MutTensorDDRC<BatchScalarF64<BATCH>, ROWS, COLS>>,
-        mut rhs_dx: Option<MutTensorDDR<BatchScalarF64<BATCH>, COLS>>,
-    ) -> Option<DijPairMV<BatchScalarF64<BATCH>, ROWS, COLS>> {
-        if lhs_dx.is_none() && rhs_dx.is_none() {
-            return None;
-        }
-
-        if lhs_dx.is_some() && rhs_dx.is_some() {
-            assert_eq!(
-                lhs_dx.clone().unwrap().dims(),
-                rhs_dx.clone().unwrap().dims()
-            );
-        }
-
-        if lhs_dx.is_none() {
-            lhs_dx = Some(
-                MutTensorDDRC::<BatchScalarF64<BATCH>, ROWS, COLS>::from_shape(
-                    rhs_dx.clone().unwrap().dims(),
-                ),
-            )
-        } else if rhs_dx.is_none() {
-            rhs_dx = Some(MutTensorDDR::<BatchScalarF64<BATCH>, COLS>::from_shape(
-                lhs_dx.clone().unwrap().dims(),
-            ))
-        }
-
-        Some(DijPairMV::<BatchScalarF64<BATCH>, ROWS, COLS> {
-            lhs: lhs_dx.unwrap(),
-            rhs: rhs_dx.unwrap(),
-        })
-    }
-}
-
-impl<const ROWS: usize, const COLS: usize, const BATCH: usize> PartialEq
-    for DualBatchMatrix<ROWS, COLS, BATCH>
-where
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    fn eq(&self, other: &Self) -> bool {
-        self.real_part == other.real_part && self.dij_part == other.dij_part
-    }
-}
-
-impl<const ROWS: usize, const COLS: usize, const BATCH: usize> AbsDiffEq
-    for DualBatchMatrix<ROWS, COLS, BATCH>
-where
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    type Epsilon = f64;
-
-    fn default_epsilon() -> Self::Epsilon {
-        f64::default_epsilon()
-    }
-
-    fn abs_diff_eq(&self, other: &Self, epsilon: Self::Epsilon) -> bool {
-        self.real_part.abs_diff_eq(&other.real_part, epsilon)
-    }
-}
-
-impl<const ROWS: usize, const COLS: usize, const BATCH: usize> RelativeEq
-    for DualBatchMatrix<ROWS, COLS, BATCH>
-where
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    fn default_max_relative() -> Self::Epsilon {
-        f64::default_max_relative()
-    }
-
-    fn relative_eq(
-        &self,
-        other: &Self,
-        epsilon: Self::Epsilon,
-        max_relative: Self::Epsilon,
-    ) -> bool {
-        self.real_part
-            .relative_eq(&other.real_part, epsilon, max_relative)
-    }
-}
-
-impl<const ROWS: usize, const COLS: usize, const BATCH: usize>
-    IsMatrix<DualBatchScalar<BATCH>, ROWS, COLS, BATCH> for DualBatchMatrix<ROWS, COLS, BATCH>
-where
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    fn from_f64(val: f64) -> Self {
-        DualBatchMatrix {
-            real_part: BatchMatF64::<ROWS, COLS, BATCH>::from_f64(val),
-            dij_part: None,
-        }
-    }
-
-    fn set_elem(&mut self, idx: [usize; 2], val: DualBatchScalar<BATCH>) {
-        self.real_part.set_elem(idx, val.real_part);
-        if self.dij_part.is_some() {
-            let dij = &mut self.dij_part.as_mut().unwrap();
-            for i in 0..dij.dims()[0] {
-                for j in 0..dij.dims()[1] {
-                    dij.mut_view().get_mut([i, j])[(idx[0], idx[1])] =
-                        val.dij_part.clone().unwrap().get([i, j]);
-                }
-            }
-        }
-    }
-
-    fn from_scalar(val: DualBatchScalar<BATCH>) -> Self {
-        DualBatchMatrix {
-            real_part: BatchMatF64::<ROWS, COLS, BATCH>::from_scalar(val.real_part),
-            dij_part: val.dij_part.map(|dij_val| {
-                MutTensorDDRC::from_map(&dij_val.view(), |v| {
-                    BatchMatF64::<ROWS, COLS, BATCH>::from_scalar(*v)
-                })
-            }),
-        }
-    }
-
-    fn mat_mul<const COLS2: usize>(
-        &self,
-        rhs: DualBatchMatrix<COLS, COLS2, BATCH>,
-    ) -> DualBatchMatrix<ROWS, COLS2, BATCH> {
-        DualBatchMatrix {
-            real_part: self.real_part * rhs.real_part,
-            dij_part: DualBatchMatrix::<ROWS, COLS2, BATCH>::binary_mm_dij(
-                &self.dij_part,
-                &rhs.dij_part,
-                |l_dij| l_dij * rhs.real_part,
-                |r_dij| self.real_part * r_dij,
-            ),
-        }
-    }
-
-    fn from_real_matrix(val: BatchMatF64<ROWS, COLS, BATCH>) -> Self {
-        Self {
-            real_part: val,
-            dij_part: None,
-        }
-    }
-
-    fn scaled(&self, s: DualBatchScalar<BATCH>) -> Self {
-        DualBatchMatrix {
-            real_part: self.real_part * s.real_part,
-            dij_part: DualBatchMatrix::<ROWS, COLS, BATCH>::binary_ms_dij(
-                &self.dij_part,
-                &s.dij_part,
-                |l_dij| l_dij * s.real_part,
-                |r_dij| self.real_part * *r_dij,
-            ),
-        }
-    }
-
-    fn identity() -> Self {
-        DualBatchMatrix::from_real_matrix(BatchMatF64::<ROWS, COLS, BATCH>::identity())
-    }
-
-    fn get_elem(&self, idx: [usize; 2]) -> DualBatchScalar<BATCH> {
-        DualBatchScalar::<BATCH> {
-            real_part: self.real_part.get_elem(idx),
-            dij_part: self
-                .dij_part
-                .clone()
-                .map(|dij_val| MutTensorDD::from_map(&dij_val.view(), |v| v[(idx[0], idx[1])])),
-        }
-    }
-
-    fn from_array2(duals: [[DualBatchScalar<BATCH>; COLS]; ROWS]) -> Self {
-        let mut shape = None;
-        let mut val_mat = BatchMatF64::<ROWS, COLS, BATCH>::zeros();
-        for i in 0..duals.len() {
-            let d_rows = duals[i].clone();
-            for j in 0..d_rows.len() {
-                let d = d_rows.clone()[j].clone();
-
-                val_mat[(i, j)] = d.real_part;
-                if d.dij_part.is_some() {
-                    shape = Some(d.dij_part.clone().unwrap().dims());
-                }
-            }
-        }
-
-        if shape.is_none() {
-            return DualBatchMatrix {
-                real_part: val_mat,
-                dij_part: None,
-            };
-        }
-        let shape = shape.unwrap();
-
-        let mut r = MutTensorDDRC::<BatchScalarF64<BATCH>, ROWS, COLS>::from_shape(shape);
-        for i in 0..duals.len() {
-            let d_rows = duals[i].clone();
-            for j in 0..d_rows.len() {
-                let d = d_rows.clone()[j].clone();
-                if d.dij_part.is_some() {
-                    for d0 in 0..shape[0] {
-                        for d1 in 0..shape[1] {
-                            r.mut_view().get_mut([d0, d1])[(i, j)] =
-                                d.dij_part.clone().unwrap().get([d0, d1]);
-                        }
-                    }
-                }
-            }
-        }
-        DualBatchMatrix {
-            real_part: val_mat,
-            dij_part: Some(r),
-        }
-    }
-
-    fn real_matrix(&self) -> &BatchMatF64<ROWS, COLS, BATCH> {
-        &self.real_part
-    }
-
-    fn block_mat2x2<const R0: usize, const R1: usize, const C0: usize, const C1: usize>(
-        top_row: (
-            DualBatchMatrix<R0, C0, BATCH>,
-            DualBatchMatrix<R0, C1, BATCH>,
-        ),
-        bot_row: (
-            DualBatchMatrix<R1, C0, BATCH>,
-            DualBatchMatrix<R1, C1, BATCH>,
-        ),
-    ) -> Self {
-        assert_eq!(R0 + R1, ROWS);
-        assert_eq!(C0 + C1, COLS);
-
-        Self::block_mat2x1(
-            DualBatchMatrix::<R0, COLS, BATCH>::block_mat1x2(top_row.0, top_row.1),
-            DualBatchMatrix::<R1, COLS, BATCH>::block_mat1x2(bot_row.0, bot_row.1),
-        )
-    }
-
-    fn block_mat2x1<const R0: usize, const R1: usize>(
-        top_row: DualBatchMatrix<R0, COLS, BATCH>,
-        bot_row: DualBatchMatrix<R1, COLS, BATCH>,
-    ) -> Self {
-        assert_eq!(R0 + R1, ROWS);
-        let maybe_dij = Self::two_dx(top_row.dij_part, bot_row.dij_part);
-
-        Self {
-            real_part: BatchMatF64::<ROWS, COLS, BATCH>::block_mat2x1(
-                top_row.real_part,
-                bot_row.real_part,
-            ),
-            dij_part: match maybe_dij {
-                Some(dij_val) => {
-                    let mut r = MutTensorDDRC::<BatchScalarF64<BATCH>, ROWS, COLS>::from_shape(
-                        dij_val.shape(),
-                    );
-                    for d0 in 0..dij_val.shape()[0] {
-                        for d1 in 0..dij_val.shape()[1] {
-                            *r.mut_view().get_mut([d0, d1]) =
-                                BatchMatF64::<ROWS, COLS, BATCH>::block_mat2x1(
-                                    dij_val.lhs.get([d0, d1]),
-                                    dij_val.rhs.get([d0, d1]),
-                                );
-                        }
-                    }
-                    Some(r)
-                }
-                None => None,
-            },
-        }
-    }
-
-    fn block_mat1x2<const C0: usize, const C1: usize>(
-        left_col: DualBatchMatrix<ROWS, C0, BATCH>,
-        righ_col: DualBatchMatrix<ROWS, C1, BATCH>,
-    ) -> Self {
-        assert_eq!(C0 + C1, COLS);
-        let maybe_dij = Self::two_dx(left_col.dij_part, righ_col.dij_part);
-
-        Self {
-            real_part: BatchMatF64::<ROWS, COLS, BATCH>::block_mat1x2(
-                left_col.real_part,
-                righ_col.real_part,
-            ),
-            dij_part: match maybe_dij {
-                Some(dij_val) => {
-                    let mut r = MutTensorDDRC::<BatchScalarF64<BATCH>, ROWS, COLS>::from_shape(
-                        dij_val.shape(),
-                    );
-                    for d0 in 0..dij_val.shape()[0] {
-                        for d1 in 0..dij_val.shape()[1] {
-                            *r.mut_view().get_mut([d0, d1]) =
-                                BatchMatF64::<ROWS, COLS, BATCH>::block_mat1x2(
-                                    dij_val.lhs.get([d0, d1]),
-                                    dij_val.rhs.get([d0, d1]),
-                                );
-                        }
-                    }
-                    Some(r)
-                }
-                None => None,
-            },
-        }
-    }
-
-    fn get_fixed_submat<const R: usize, const C: usize>(
-        &self,
-        start_r: usize,
-        start_c: usize,
-    ) -> DualBatchMatrix<R, C, BATCH> {
-        DualBatchMatrix {
-            real_part: self.real_part.get_fixed_submat(start_r, start_c),
-            dij_part: self.dij_part.clone().map(|dij_val| {
-                MutTensorDDRC::from_map(&dij_val.view(), |v| v.get_fixed_submat(start_r, start_c))
-            }),
-        }
-    }
-
-    fn get_col_vec(&self, start_r: usize) -> DualBatchVector<ROWS, BATCH> {
-        DualBatchVector {
-            real_part: self.real_part.get_col_vec(start_r),
-            dij_part: self
-                .dij_part
-                .clone()
-                .map(|dij_val| MutTensorDDR::from_map(&dij_val.view(), |v| v.get_col_vec(start_r))),
-        }
-    }
-
-    fn get_row_vec(&self, c: usize) -> DualBatchVector<COLS, BATCH> {
-        DualBatchVector {
-            real_part: self.real_part.get_row_vec(c),
-            dij_part: self
-                .dij_part
-                .clone()
-                .map(|dij_val| MutTensorDDR::from_map(&dij_val.view(), |v| v.get_row_vec(c))),
-        }
-    }
-
-    fn from_real_scalar_array2(vals: [[BatchScalarF64<BATCH>; COLS]; ROWS]) -> Self {
-        DualBatchMatrix {
-            real_part: BatchMatF64::from_real_scalar_array2(vals),
-            dij_part: None,
-        }
-    }
-
-    fn from_f64_array2(vals: [[f64; COLS]; ROWS]) -> Self {
-        DualBatchMatrix {
-            real_part: BatchMatF64::from_f64_array2(vals),
-            dij_part: None,
-        }
-    }
-
-    fn set_col_vec(
-        &mut self,
-        c: usize,
-        v: <DualBatchScalar<BATCH> as IsScalar<BATCH>>::Vector<ROWS>,
-    ) {
-        self.real_part.set_col_vec(c, v.real_part);
-        todo!();
-    }
-
-    fn to_dual(self) -> <DualBatchScalar<BATCH> as IsScalar<BATCH>>::DualMatrix<ROWS, COLS> {
-        self
-    }
-
-    fn select(self, mask: &Mask<i64, BATCH>, other: Self) -> Self {
-        let maybe_dij = Self::two_dx(self.dij_part, other.dij_part);
-
-        DualBatchMatrix {
-            real_part: self.real_part.select(mask, other.real_part),
-            dij_part: match maybe_dij {
-                Some(dij) => {
-                    let mut r =
-                        MutTensorDDRC::<BatchScalarF64<BATCH>, ROWS, COLS>::from_shape(dij.shape());
-                    for i in 0..dij.shape()[0] {
-                        for j in 0..dij.shape()[1] {
-                            *r.get_mut([i, j]) =
-                                dij.lhs.get([i, j]).select(mask, dij.rhs.get([i, j]));
-                        }
-                    }
-                    Some(r)
-                }
-                _ => None,
-            },
-        }
-    }
-}
-
-impl<const ROWS: usize, const COLS: usize, const BATCH: usize> Add
-    for DualBatchMatrix<ROWS, COLS, BATCH>
-where
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    type Output = DualBatchMatrix<ROWS, COLS, BATCH>;
-
-    fn add(self, rhs: Self) -> Self::Output {
-        DualBatchMatrix {
-            real_part: self.real_part + rhs.real_part,
-            dij_part: Self::binary_mm_dij(
-                &self.dij_part,
-                &rhs.dij_part,
-                |l_dij| *l_dij,
-                |r_dij| *r_dij,
-            ),
-        }
-    }
-}
-
-impl<const ROWS: usize, const COLS: usize, const BATCH: usize> Sub
-    for DualBatchMatrix<ROWS, COLS, BATCH>
-where
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    type Output = DualBatchMatrix<ROWS, COLS, BATCH>;
-
-    fn sub(self, rhs: Self) -> Self::Output {
-        DualBatchMatrix {
-            real_part: self.real_part - rhs.real_part,
-            dij_part: Self::binary_mm_dij(
-                &self.dij_part,
-                &rhs.dij_part,
-                |l_dij| *l_dij,
-                |r_dij| -r_dij,
-            ),
-        }
-    }
-}
-
-impl<const ROWS: usize, const COLS: usize, const BATCH: usize> Neg
-    for DualBatchMatrix<ROWS, COLS, BATCH>
-where
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    type Output = DualBatchMatrix<ROWS, COLS, BATCH>;
-
-    fn neg(self) -> Self::Output {
-        DualBatchMatrix {
-            real_part: -self.real_part,
-            dij_part: self
-                .dij_part
-                .clone()
-                .map(|dij_val| MutTensorDDRC::from_map(&dij_val.view(), |v| -v)),
-        }
-    }
-}
-
-impl<const ROWS: usize, const COLS: usize, const BATCH: usize> Zero
-    for DualBatchMatrix<ROWS, COLS, BATCH>
-where
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    fn zero() -> Self {
-        Self::from_real_matrix(BatchMatF64::zeros())
-    }
-
-    fn is_zero(&self) -> bool {
-        self.real_part.is_zero()
-    }
-}
-
-impl<const ROWS: usize, const COLS: usize, const BATCH: usize> Mul<DualBatchVector<COLS, BATCH>>
-    for DualBatchMatrix<ROWS, COLS, BATCH>
-where
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    type Output = DualBatchVector<ROWS, BATCH>;
-
-    fn mul(self, rhs: DualBatchVector<COLS, BATCH>) -> Self::Output {
-        Self::Output {
-            real_part: self.real_part * rhs.real_part,
-            dij_part: Self::binary_mv_dij(
-                &self.dij_part,
-                &rhs.dij_part,
-                |l_dij| l_dij * rhs.real_part,
-                |r_dij| self.real_part * r_dij,
-            ),
-        }
-    }
-}
-
-impl<const ROWS: usize, const COLS: usize, const BATCH: usize> Debug
-    for DualBatchMatrix<ROWS, COLS, BATCH>
-where
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        if self.dij_part.is_some() {
-            f.debug_struct("DualScalarLike")
-                .field("val", &self.real_part)
-                .field("dij_val", &self.dij_part.as_ref().unwrap().elem_view())
-                .finish()
-        } else {
-            f.debug_struct("DualScalarLike")
-                .field("val", &self.real_part)
-                .finish()
-        }
-    }
-}
-
 #[test]
 fn dual_matrix_tests() {
-    use crate::calculus::dual::dual_scalar::DualBatchScalar;
-    use crate::calculus::dual::dual_scalar::DualScalar;
+    use crate::calculus::dual::DualScalar;
     use crate::calculus::maps::matrix_valued_maps::MatrixValuedMapFromMatrix;
+
+    #[cfg(feature = "simd")]
+    use crate::calculus::dual::DualBatchScalar;
+    #[cfg(feature = "simd")]
     use crate::linalg::BatchScalarF64;
 
     #[cfg(test)]
@@ -1454,10 +778,14 @@ fn dual_matrix_tests() {
     }
 
     def_test_template!(f64, DualScalar, 1);
+    #[cfg(feature = "simd")]
     def_test_template!(BatchScalarF64<2>, DualBatchScalar<2>, 2);
+    #[cfg(feature = "simd")]
    def_test_template!(BatchScalarF64<4>, DualBatchScalar<4>, 4);
 
     f64::run();
+    #[cfg(feature = "simd")]
    BatchScalarF64::<2>::run();
+    #[cfg(feature = "simd")]
    BatchScalarF64::<4>::run();
 }
diff --git a/crates/sophus_core/src/calculus/dual/dual_scalar.rs b/crates/sophus_core/src/calculus/dual/dual_scalar.rs
index 711d545..22ec263 100644
--- a/crates/sophus_core/src/calculus/dual/dual_scalar.rs
+++ b/crates/sophus_core/src/calculus/dual/dual_scalar.rs
@@ -1,21 +1,19 @@
-use super::dual_matrix::DualBatchMatrix;
 use super::dual_matrix::DualMatrix;
-use super::dual_vector::DualBatchVector;
 use super::dual_vector::DualVector;
+
 use crate::linalg::scalar::NumberCategory;
-use crate::linalg::BatchMatF64;
-use crate::linalg::BatchScalarF64;
-use crate::linalg::BatchVecF64;
+
 use crate::linalg::MatF64;
 use crate::linalg::VecF64;
 use crate::prelude::*;
 use crate::tensor::mut_tensor::InnerScalarToVec;
 use crate::tensor::mut_tensor::MutTensorDD;
-use approx::assert_abs_diff_eq;
+
 use approx::AbsDiffEq;
 use approx::RelativeEq;
 use num_traits::One;
 use num_traits::Zero;
+
 use std::fmt::Debug;
 use std::ops::Add;
 use std::ops::AddAssign;
@@ -24,9 +22,6 @@ use std::ops::Mul;
 use std::ops::Neg;
 use std::ops::Sub;
 use std::ops::SubAssign;
-use std::simd::LaneCount;
-use std::simd::Mask;
-use std::simd::SupportedLaneCount;
 
 /// Trait for dual numbers
 pub trait IsDual {}
@@ -43,27 +38,6 @@ pub struct DualScalar {
 
 impl IsDual for DualScalar {}
 
-/// Dual number - a real number and an infinitesimal number (batch version)
-#[derive(Clone)]
-pub struct DualBatchScalar<const BATCH: usize>
-where
-    BatchScalarF64<BATCH>: IsCoreScalar,
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    /// real part
-    pub real_part: BatchScalarF64<BATCH>,
-
-    /// infinitesimal part - represents derivative
-    pub dij_part: Option<MutTensorDD<BatchScalarF64<BATCH>>>,
-}
-
-impl<const BATCH: usize> IsDual for DualBatchScalar<BATCH>
-where
-    BatchScalarF64<BATCH>: IsCoreScalar,
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-}
-
 /// Trait for scalar dual numbers
 pub trait IsDualScalar<const BATCH: usize>: IsScalar<BATCH> + IsDual {
     /// Create a new dual scalar from real scalar for auto-differentiation with respect to self
@@ -633,703 +607,15 @@ impl Sub<&DualScalar> for DualScalar {
         }
     }
 }
-
-impl<const BATCH: usize> IsDualScalar<BATCH> for DualBatchScalar<BATCH>
-where
-    BatchScalarF64<BATCH>: IsCoreScalar,
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    fn new_with_dij(val: BatchScalarF64<BATCH>) -> Self {
-        let dij_val = MutTensorDD::from_shape_and_val([1, 1], BatchScalarF64::<BATCH>::ones());
-        Self {
-            real_part: val,
-            dij_part: Some(dij_val),
-        }
-    }
-
-    fn dij_val(self) -> Option<MutTensorDD<BatchScalarF64<BATCH>>> {
-        self.dij_part
-    }
-
-    fn vector_with_dij<const ROWS: usize>(val: Self::RealVector<ROWS>) -> Self::Vector<ROWS> {
-        DualBatchVector::<ROWS, BATCH>::new_with_dij(val)
-    }
-
-    fn matrix_with_dij<const ROWS: usize, const COLS: usize>(
-        val: Self::RealMatrix<ROWS, COLS>,
-    ) -> Self::Matrix<ROWS, COLS> {
-        DualBatchMatrix::<ROWS, COLS, BATCH>::new_with_dij(val)
-    }
-}
-
-impl<const BATCH: usize> IsCoreScalar for DualBatchScalar<BATCH>
-where
-    BatchScalarF64<BATCH>: IsCoreScalar,
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    fn number_category() -> NumberCategory {
-        NumberCategory::Real
-    }
-}
-
-impl<const BATCH: usize> AsRef<DualBatchScalar<BATCH>> for DualBatchScalar<BATCH>
-where
-    BatchScalarF64<BATCH>: IsCoreScalar,
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    fn as_ref(&self) -> &DualBatchScalar<BATCH>
-    where
-        BatchScalarF64<BATCH>: IsCoreScalar,
-        LaneCount<BATCH>: SupportedLaneCount,
-    {
-        self
-    }
-}
-
-impl<const BATCH: usize> One for DualBatchScalar<BATCH>
-where
-    BatchScalarF64<BATCH>: IsCoreScalar,
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    fn one() -> Self {
-        <DualBatchScalar<BATCH>>::from_f64(1.0)
-    }
-}
-
-impl<const BATCH: usize> Zero for DualBatchScalar<BATCH>
-where
-    BatchScalarF64<BATCH>: IsCoreScalar,
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    fn zero() -> Self {
-        <DualBatchScalar<BATCH>>::from_f64(0.0)
-    }
-
-    fn is_zero(&self) -> bool {
-        self.real_part == <DualBatchScalar<BATCH>>::from_f64(0.0).real_part()
-    }
-}
-
-impl<const BATCH: usize> DualBatchScalar<BATCH>
-where
-    BatchScalarF64<BATCH>: IsCoreScalar,
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    fn binary_dij<
-        F: FnMut(&BatchScalarF64<BATCH>) -> BatchScalarF64<BATCH>,
-        G: FnMut(&BatchScalarF64<BATCH>) -> BatchScalarF64<BATCH>,
-    >(
-        lhs_dx: &Option<MutTensorDD<BatchScalarF64<BATCH>>>,
-        rhs_dx: &Option<MutTensorDD<BatchScalarF64<BATCH>>>,
-        mut left_op: F,
-        mut right_op: G,
-    ) -> Option<MutTensorDD<BatchScalarF64<BATCH>>> {
-        match (lhs_dx, rhs_dx) {
-            (None, None) => None,
-            (None, Some(rhs_dij)) => {
-                let out_dij =
-                    MutTensorDD::from_map(&rhs_dij.view(), |r_dij: &BatchScalarF64<BATCH>| {
-                        right_op(r_dij)
-                    });
-                Some(out_dij)
-            }
-            (Some(lhs_dij), None) => {
-                let out_dij =
-                    MutTensorDD::from_map(&lhs_dij.view(), |l_dij: &BatchScalarF64<BATCH>| {
-                        left_op(l_dij)
-                    });
-                Some(out_dij)
-            }
-            (Some(lhs_dij), Some(rhs_dij)) => {
-                let dyn_mat = MutTensorDD::from_map2(
-                    &lhs_dij.view(),
-                    &rhs_dij.view(),
-                    |l_dij: &BatchScalarF64<BATCH>, r_dij: &BatchScalarF64<BATCH>| {
-                        left_op(l_dij) + right_op(r_dij)
-                    },
-                );
-                Some(dyn_mat)
-            }
-        }
-    }
-}
-
-impl<const BATCH: usize> Neg for DualBatchScalar<BATCH>
-where
-    BatchScalarF64<BATCH>: IsCoreScalar,
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    type Output = DualBatchScalar<BATCH>;
-
-    fn neg(self) -> Self {
-        Self {
-            real_part: -self.real_part,
-            dij_part: match self.dij_part.clone() {
-                Some(dij_val) => {
-                    let dyn_mat =
-                        MutTensorDD::from_map(&dij_val.view(), |v: &BatchScalarF64<BATCH>| -*v);
-
-                    Some(dyn_mat)
-                }
-                None => None,
-            },
-        }
-    }
-}
-
-impl<const BATCH: usize> PartialEq for DualBatchScalar<BATCH>
-where
-    BatchScalarF64<BATCH>: IsCoreScalar,
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    fn eq(&self, other: &Self) -> bool {
-        self.real_part == other.real_part
-    }
-}
-
-impl<const BATCH: usize> AbsDiffEq for DualBatchScalar<BATCH>
-where
-    BatchScalarF64<BATCH>: IsCoreScalar,
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    type Epsilon = f64;
-
-    fn default_epsilon() -> Self::Epsilon {
-        f64::default_epsilon()
-    }
-
-    fn abs_diff_eq(&self, other: &Self, epsilon: Self::Epsilon) -> bool {
-        for i in 0..BATCH {
-            if !self.real_part.extract_single(i).abs_diff_eq(
-                &other.real_part.extract_single(i),
-                epsilon.extract_single(i),
-            ) {
-                return false;
-            }
-        }
-        true
-    }
-}
-
-impl<const BATCH: usize> RelativeEq for DualBatchScalar<BATCH>
-where
-    BatchScalarF64<BATCH>: IsCoreScalar,
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    fn default_max_relative() -> Self::Epsilon {
-        f64::default_max_relative()
-    }
-
-    fn relative_eq(
-        &self,
-        other: &Self,
-        epsilon: Self::Epsilon,
-        max_relative: Self::Epsilon,
-    ) -> bool {
-        for i in 0..BATCH {
-            if !self.real_part.extract_single(i).relative_eq(
-                &other.real_part.extract_single(i),
-                epsilon.extract_single(i),
-                max_relative.extract_single(i),
-            ) {
-                return false;
-            }
-        }
-        true
-    }
-}
-
-impl<const BATCH: usize> From<BatchScalarF64<BATCH>> for DualBatchScalar<BATCH>
-where
-    BatchScalarF64<BATCH>: IsCoreScalar,
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    fn from(value: BatchScalarF64<BATCH>) -> Self {
-        Self::from_real_scalar(value)
-    }
-}
-
-impl<const BATCH: usize> Debug for DualBatchScalar<BATCH>
-where
-    BatchScalarF64<BATCH>: IsCoreScalar,
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        if self.dij_part.is_some() {
-            f.debug_struct("DualScalar")
-                .field("val", &self.real_part)
-                .field("dij_val", &self.dij_part.as_ref().unwrap().elem_view())
-                .finish()
-        } else {
-            f.debug_struct("DualScalar")
-                .field("val", &self.real_part)
-                .finish()
-        }
-    }
-}
-
-impl<const BATCH: usize> IsScalar<BATCH> for DualBatchScalar<BATCH>
-where
-    BatchScalarF64<BATCH>: IsCoreScalar,
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    type Scalar = DualBatchScalar<BATCH>;
-    type RealScalar = BatchScalarF64<BATCH>;
-    type SingleScalar = DualScalar;
-    type DualVector<const ROWS: usize> = DualBatchVector<ROWS, BATCH>;
-    type DualMatrix<const ROWS: usize, const COLS: usize> = DualBatchMatrix<ROWS, COLS, BATCH>;
-    type RealVector<const ROWS: usize> = BatchVecF64<ROWS, BATCH>;
-    type RealMatrix<const ROWS: usize, const COLS: usize> = BatchMatF64<ROWS, COLS, BATCH>;
-    type Vector<const ROWS: usize> = DualBatchVector<ROWS, BATCH>;
-    type Matrix<const ROWS: usize, const COLS: usize> = DualBatchMatrix<ROWS, COLS, BATCH>;
-
-    type Mask = Mask<i64, BATCH>;
-
-    fn from_real_scalar(val: BatchScalarF64<BATCH>) -> Self {
-        Self {
-            real_part: val,
-            dij_part: None,
-        }
-    }
-
-    fn scalar_examples() -> Vec<Self> {
-        [1.0, 2.0, 3.0].iter().map(|&v| Self::from_f64(v)).collect()
-    }
-
-    fn from_real_array(arr: [f64; BATCH]) -> Self {
-        Self::from_real_scalar(BatchScalarF64::<BATCH>::from_real_array(arr))
-    }
-
-    fn to_real_array(&self) -> [f64; BATCH] {
-        self.real_part.to_real_array()
-    }
-
-    fn extract_single(&self, i: usize) -> Self::SingleScalar {
-        Self::SingleScalar {
-            real_part: self.real_part.extract_single(i),
-            dij_part: match self.dij_part.clone() {
-                Some(dij_val) => {
-                    let dyn_mat =
-                        MutTensorDD::from_map(&dij_val.view(), |dij: &BatchScalarF64<BATCH>| {
-                            dij.extract_single(i)
-                        });
-                    Some(dyn_mat)
-                }
-                None => None,
-            },
-        }
-    }
-
-    fn cos(self) -> DualBatchScalar<BATCH>
-    where
-        BatchScalarF64<BATCH>: IsCoreScalar,
-        LaneCount<BATCH>: SupportedLaneCount,
-    {
-        Self {
-            real_part: self.real_part.cos(),
-            dij_part: match self.dij_part.clone() {
-                Some(dij_val) => {
-                    let dyn_mat =
-                        MutTensorDD::from_map(&dij_val.view(), |dij: &BatchScalarF64<BATCH>| {
-                            -*dij * self.real_part.sin()
-                        });
-                    Some(dyn_mat)
-                }
-                None => None,
-            },
-        }
-    }
-
-    fn signum(&self) -> Self {
-        Self {
-            real_part: self.real_part.signum(),
-            dij_part: None,
-        }
-    }
-
-    fn sin(self) -> DualBatchScalar<BATCH>
-    where
-        BatchScalarF64<BATCH>: IsCoreScalar,
-        LaneCount<BATCH>: SupportedLaneCount,
-    {
-        Self {
-            real_part: self.real_part.sin(),
-            dij_part: match self.dij_part.clone() {
-                Some(dij_val) => {
-                    let dyn_mat =
-                        MutTensorDD::from_map(&dij_val.view(), |dij: &BatchScalarF64<BATCH>| {
-                            *dij * self.real_part.cos()
-                        });
-                    Some(dyn_mat)
-                }
-                None => None,
-            },
-        }
-    }
-
-    fn abs(self) -> Self {
-        Self {
-            real_part: self.real_part.abs(),
-            dij_part: match self.dij_part.clone() {
-                Some(dij_val) => {
-                    let dyn_mat =
-                        MutTensorDD::from_map(&dij_val.view(), |dij: &BatchScalarF64<BATCH>| {
-                            *dij * self.real_part.signum()
-                        });
-                    Some(dyn_mat)
-                }
-                None => None,
-            },
-        }
-    }
-
-    fn atan2(self, rhs: Self) -> Self {
-        let inv_sq_nrm: BatchScalarF64<BATCH> = BatchScalarF64::ones()
-            / (self.real_part * self.real_part + rhs.real_part * rhs.real_part);
-        Self {
-            real_part: self.real_part.atan2(rhs.real_part),
-            dij_part: Self::binary_dij(
-                &self.dij_part,
-                &rhs.dij_part,
-                |l_dij| inv_sq_nrm * ((*l_dij) * rhs.real_part),
-                |r_dij| -inv_sq_nrm * (self.real_part * (*r_dij)),
-            ),
-        }
-    }
-
-    fn real_part(&self) -> BatchScalarF64<BATCH> {
-        self.real_part
-    }
-
-    fn sqrt(self) -> Self {
-        let sqrt = self.real_part.sqrt();
-        Self {
-            real_part: sqrt,
-            dij_part: match self.dij_part {
-                Some(dij) => {
-                    let out_dij =
-                        MutTensorDD::from_map(&dij.view(), |dij: &BatchScalarF64<BATCH>| {
-                            *dij * BatchScalarF64::<BATCH>::from_f64(1.0)
-                                / (BatchScalarF64::<BATCH>::from_f64(2.0) * sqrt)
-                        });
-                    Some(out_dij)
-                }
-                None => None,
-            },
-        }
-    }
-
-    fn to_vec(self) -> DualBatchVector<1, BATCH> {
-        DualBatchVector::<1, BATCH> {
-            real_part: self.real_part.real_part().to_vec(),
-            dij_part: match self.dij_part {
-                Some(dij) => {
-                    let tmp = dij.inner_scalar_to_vec();
-                    Some(tmp)
-                }
-                None => None,
-            },
-        }
-    }
-
-    fn tan(self) -> Self {
-        Self {
-            real_part: self.real_part.tan(),
-            dij_part: match self.dij_part.clone() {
-                Some(dij_val) => {
-                    let c = self.real_part.cos();
-                    let sec_squared = BatchScalarF64::<BATCH>::ones() / (c * c);
-                    let dyn_mat =
-                        MutTensorDD::from_map(&dij_val.view(), |dij: &BatchScalarF64<BATCH>| {
-                            *dij * sec_squared
-                        });
-                    Some(dyn_mat)
-                }
-                None => None,
-            },
-        }
-    }
-
-    fn acos(self) -> Self {
-        Self {
-            real_part: self.real_part.acos(),
-            dij_part: match self.dij_part.clone() {
-                Some(dij_val) => {
-                    let dval = -BatchScalarF64::<BATCH>::ones()
-                        / (BatchScalarF64::<BATCH>::ones() - self.real_part * self.real_part)
-                            .sqrt();
-                    let dyn_mat =
-                        MutTensorDD::from_map(&dij_val.view(), |dij: &BatchScalarF64<BATCH>| {
-                            *dij * dval
-                        });
-                    Some(dyn_mat)
-                }
-                None => None,
-            },
-        }
-    }
-
-    fn asin(self) -> Self {
-        Self {
-            real_part: self.real_part.asin(),
-            dij_part: match self.dij_part.clone() {
-                Some(dij_val) => {
-                    let dval = BatchScalarF64::<BATCH>::ones()
-                        / (BatchScalarF64::<BATCH>::ones() - self.real_part * self.real_part)
-                            .sqrt();
-                    let dyn_mat =
-                        MutTensorDD::from_map(&dij_val.view(), |dij: &BatchScalarF64<BATCH>| {
-                            *dij * dval
-                        });
-                    Some(dyn_mat)
-                }
-                None => None,
-            },
-        }
-    }
-
-    fn atan(self) -> Self {
-        Self {
-            real_part: self.real_part.atan(),
-            dij_part: match self.dij_part.clone() {
-                Some(dij_val) => {
-                    let dval = BatchScalarF64::<BATCH>::ones()
-                        / (BatchScalarF64::<BATCH>::ones() + self.real_part * self.real_part);
-                    let dyn_mat =
-                        MutTensorDD::from_map(&dij_val.view(), |dij: &BatchScalarF64<BATCH>| {
-                            *dij * dval
-                        });
-                    Some(dyn_mat)
-                }
-                None => None,
-            },
-        }
-    }
-
-    fn fract(self) -> Self {
-        Self {
-            real_part: self.real_part.fract(),
-            dij_part: match self.dij_part.clone() {
-                Some(dij_val) => {
-                    let dyn_mat =
-                        MutTensorDD::from_map(&dij_val.view(), |dij: &BatchScalarF64<BATCH>| *dij);
-                    Some(dyn_mat)
-                }
-                None => None,
-            },
-        }
-    }
-
-    fn floor(&self) -> BatchScalarF64<BATCH> {
-        self.real_part.floor()
-    }
-
-    fn from_f64(val: f64) -> Self {
-        Self::from_real_scalar(BatchScalarF64::<BATCH>::from_f64(val))
-    }
-
-    type DualScalar = Self;
-
-    fn ones() -> Self {
-        Self::from_f64(1.0)
-    }
-
-    fn zeros() -> Self {
-        Self::from_f64(0.0)
-    }
-
-    fn test_suite() {
-        let examples = Self::scalar_examples();
-        for a in &examples {
-            let sin_a = a.clone().sin();
-            let cos_a = a.clone().cos();
-            let val = sin_a.clone() * sin_a + cos_a.clone() * cos_a;
-            let one = Self::ones();
-
-            for i in 0..BATCH {
-                assert_abs_diff_eq!(val.extract_single(i), one.extract_single(i));
-            }
-        }
-    }
-
-    fn less_equal(&self, rhs: &Self) -> Self::Mask {
-        self.real_part.less_equal(&rhs.real_part)
-    }
-
-    fn to_dual(self) -> Self::DualScalar {
-        self
-    }
-
-    fn select(self, mask: &Self::Mask, other: Self) -> Self {
-        Self {
-            real_part: self.real_part.select(mask, other.real_part),
-            dij_part: match (self.dij_part, other.dij_part) {
-                (Some(lhs), Some(rhs)) => {
-                    let dyn_mat = MutTensorDD::from_map2(
-                        &lhs.view(),
-                        &rhs.view(),
-                        |l: &BatchScalarF64<BATCH>, r: &BatchScalarF64<BATCH>| l.select(mask, *r),
-                    );
-                    Some(dyn_mat)
-                }
-                _ => None,
-            },
-        }
-    }
-
-    fn greater_equal(&self, rhs: &Self) -> Self::Mask {
-        self.real_part.greater_equal(&rhs.real_part)
-    }
-}
-
-impl<const BATCH: usize> AddAssign<DualBatchScalar<BATCH>> for DualBatchScalar<BATCH>
-where
-    BatchScalarF64<BATCH>: IsCoreScalar,
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    fn add_assign(&mut self, rhs: Self) {
-        *self = self.clone().add(&rhs);
-    }
-}
-
-impl<const BATCH: usize> SubAssign<DualBatchScalar<BATCH>> for DualBatchScalar<BATCH>
-where
-    BatchScalarF64<BATCH>: IsCoreScalar,
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    fn sub_assign(&mut self, rhs: Self) {
-        *self = self.clone().sub(&rhs);
-    }
-}
-
-impl<const BATCH: usize> Add<DualBatchScalar<BATCH>> for DualBatchScalar<BATCH>
-where
-    BatchScalarF64<BATCH>: IsCoreScalar,
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    type Output = DualBatchScalar<BATCH>;
-    fn add(self, rhs: Self) -> Self::Output {
-        self.add(&rhs)
-    }
-}
-
-impl<const BATCH: usize> Add<&DualBatchScalar<BATCH>> for DualBatchScalar<BATCH>
-where
-    BatchScalarF64<BATCH>: IsCoreScalar,
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    type Output = DualBatchScalar<BATCH>;
-    fn add(self, rhs: &Self) -> Self::Output {
-        let r = self.real_part + rhs.real_part;
-
-        Self {
-            real_part: r,
-            dij_part: Self::binary_dij(
-                &self.dij_part,
-                &rhs.dij_part,
-                |l_dij| *l_dij,
-                |r_dij| *r_dij,
-            ),
-        }
-    }
-}
-
-impl<const BATCH: usize> Mul<DualBatchScalar<BATCH>> for DualBatchScalar<BATCH>
-where
-    BatchScalarF64<BATCH>: IsCoreScalar,
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    type Output = DualBatchScalar<BATCH>;
-    fn mul(self, rhs: Self) -> Self::Output {
-        self.mul(&rhs)
-    }
-}
-
-impl<const BATCH: usize> Mul<&DualBatchScalar<BATCH>> for DualBatchScalar<BATCH>
-where
-    BatchScalarF64<BATCH>: IsCoreScalar,
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    type Output = DualBatchScalar<BATCH>;
-    fn mul(self, rhs: &Self) -> Self::Output {
-        let r = self.real_part * rhs.real_part;
-
-        Self {
-            real_part: r,
-            dij_part: Self::binary_dij(
-                &self.dij_part,
-                &rhs.dij_part,
-                |l_dij| (*l_dij) * rhs.real_part,
-                |r_dij| (*r_dij) * self.real_part,
-            ),
-        }
-    }
-}
-
-impl<const BATCH: usize> Div<DualBatchScalar<BATCH>> for DualBatchScalar<BATCH>
-where
-    BatchScalarF64<BATCH>: IsCoreScalar,
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    type Output = DualBatchScalar<BATCH>;
-    fn div(self, rhs: Self) -> Self::Output {
-        self.div(&rhs)
-    }
-}
-
-impl<const BATCH: usize> Div<&DualBatchScalar<BATCH>> for DualBatchScalar<BATCH>
-where
-    BatchScalarF64<BATCH>: IsCoreScalar,
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    type Output = DualBatchScalar<BATCH>;
-    fn div(self, rhs: &Self) -> Self::Output {
-        let rhs_inv = BatchScalarF64::<BATCH>::ones() / rhs.real_part;
-        Self {
-            real_part: self.real_part * rhs_inv,
-            dij_part: Self::binary_dij(
-                &self.dij_part,
-                &rhs.dij_part,
-                |l_dij| *l_dij * rhs_inv,
-                |r_dij| -self.real_part * (*r_dij) * rhs_inv * rhs_inv,
-            ),
-        }
-    }
-}
-
-impl<const BATCH: usize> Sub<DualBatchScalar<BATCH>> for DualBatchScalar<BATCH>
-where
-    BatchScalarF64<BATCH>: IsCoreScalar,
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    type Output = DualBatchScalar<BATCH>;
-    fn sub(self, rhs: Self) -> Self::Output {
-        self.sub(&rhs)
-    }
-}
-
-impl<const BATCH: usize> Sub<&DualBatchScalar<BATCH>> for DualBatchScalar<BATCH>
-where
-    BatchScalarF64<BATCH>: IsCoreScalar,
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    type Output = DualBatchScalar<BATCH>;
-    fn sub(self, rhs: &Self) -> Self::Output {
-        Self {
-            real_part: self.real_part - rhs.real_part,
-            dij_part: Self::binary_dij(
-                &self.dij_part,
-                &rhs.dij_part,
-                |l_dij| *l_dij,
-                |r_dij| -(*r_dij),
-            ),
-        }
-    }
-}
-
 #[test]
 fn dual_scalar_tests() {
     use crate::calculus::maps::curves::ScalarValuedCurve;
 
+    #[cfg(feature = "simd")]
+    use crate::calculus::dual::DualBatchScalar;
+    #[cfg(feature = "simd")]
+    use crate::linalg::BatchScalarF64;
+
     trait DualScalarTest {
         fn run_dual_scalar_test();
     }
@@ -1469,11 +755,18 @@ fn dual_scalar_tests() {
     }
 
     def_dual_scalar_test_template!(1, f64, DualScalar);
+    #[cfg(feature = "simd")]
    def_dual_scalar_test_template!(2, BatchScalarF64<2>, DualBatchScalar<2>);
+    #[cfg(feature = "simd")]
    def_dual_scalar_test_template!(4, BatchScalarF64<4>, DualBatchScalar<4>);
+    #[cfg(feature = "simd")]
    def_dual_scalar_test_template!(8, BatchScalarF64<8>, DualBatchScalar<8>);
+
     DualScalar::run_dual_scalar_test();
+    #[cfg(feature = "simd")]
    DualBatchScalar::<2>::run_dual_scalar_test();
+    #[cfg(feature = "simd")]
    DualBatchScalar::<4>::run_dual_scalar_test();
+    #[cfg(feature = "simd")]
    DualBatchScalar::<8>::run_dual_scalar_test();
 }
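From here on, every batch-specific item is compiled only when the crate's `simd` cargo feature is enabled; the feature declaration itself lives in sophus_core's Cargo.toml, which this patch does not show. A hedged sketch of how such a gate behaves — `batch_only` is a made-up name for illustration:

#[cfg(feature = "simd")]
fn batch_only() -> &'static str {
    "compiled only with --features simd"
}

fn main() {
    // Exactly one of these branches exists in any given build.
    #[cfg(feature = "simd")]
    println!("{}", batch_only());
    #[cfg(not(feature = "simd"))]
    println!("simd feature disabled: batch types are absent");
}

The gated tests then run under `cargo test --features simd` (or `--all-features`), while a stable toolchain can still build the crate with the feature off.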
diff --git a/crates/sophus_core/src/calculus/dual/dual_vector.rs b/crates/sophus_core/src/calculus/dual/dual_vector.rs
index 40bdc05..d636e10 100644
--- a/crates/sophus_core/src/calculus/dual/dual_vector.rs
+++ b/crates/sophus_core/src/calculus/dual/dual_vector.rs
@@ -1,9 +1,5 @@
 use super::dual_matrix::DualMatrix;
-use super::dual_scalar::DualBatchScalar;
 use super::dual_scalar::DualScalar;
-use crate::calculus::dual::DualBatchMatrix;
-use crate::linalg::BatchScalarF64;
-use crate::linalg::BatchVecF64;
 use crate::linalg::VecF64;
 use crate::prelude::*;
 use crate::tensor::mut_tensor::InnerVecToMat;
@@ -15,9 +11,6 @@ use std::fmt::Debug;
 use std::ops::Add;
 use std::ops::Neg;
 use std::ops::Sub;
-use std::simd::LaneCount;
-use std::simd::Mask;
-use std::simd::SupportedLaneCount;
 
 /// Dual vector
 #[derive(Clone)]
@@ -45,19 +38,6 @@ pub trait IsDualVector<S: IsDualScalar<BATCH>, const ROWS: usize, const BATCH: usize> {
     fn dij_val(self) -> Option<MutTensorDDR<S::RealScalar, ROWS>>;
 }
 
-/// Dual vector (batch version)
-#[derive(Clone, Debug)]
-pub struct DualBatchVector<const ROWS: usize, const BATCH: usize>
-where
-    BatchScalarF64<BATCH>: IsCoreScalar,
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    /// real part
-    pub real_part: BatchVecF64<ROWS, BATCH>,
-    /// infinitesimal part - represents derivative
-    pub dij_part: Option<MutTensorDDR<BatchScalarF64<BATCH>, ROWS>>,
-}
-
 impl<const ROWS: usize> IsDual for DualVector<ROWS> {}
 
 impl<const ROWS: usize> IsDualVector<DualScalar, ROWS, 1> for DualVector<ROWS> {
@@ -100,13 +80,13 @@ where
     }
 }
 
-struct DijPair<S: IsCoreScalar, const R0: usize, const R1: usize> {
-    lhs: MutTensorDDR<S, R0>,
-    rhs: MutTensorDDR<S, R1>,
+pub(crate) struct DijPair<S: IsCoreScalar, const R0: usize, const R1: usize> {
+    pub(crate) lhs: MutTensorDDR<S, R0>,
+    pub(crate) rhs: MutTensorDDR<S, R1>,
 }
 
 impl<S: IsCoreScalar, const R0: usize, const R1: usize> DijPair<S, R0, R1> {
-    fn shape(&self) -> [usize; 2] {
+    pub(crate) fn shape(&self) -> [usize; 2] {
         self.lhs.dims()
     }
 }
@@ -533,511 +513,19 @@ impl<const ROWS: usize> IsVector<DualScalar, ROWS, 1> for DualVector<ROWS> {
     }
 }
 
-impl<const ROWS: usize, const BATCH: usize> num_traits::Zero for DualBatchVector<ROWS, BATCH>
-where
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    fn zero() -> Self {
-        DualBatchVector {
-            real_part: BatchVecF64::<ROWS, BATCH>::zeros(),
-            dij_part: None,
-        }
-    }
-
-    fn is_zero(&self) -> bool {
-        self.real_part == BatchVecF64::<ROWS, BATCH>::zeros()
-    }
-}
-
-impl<const ROWS: usize, const BATCH: usize> IsDual for DualBatchVector<ROWS, BATCH> where
-    LaneCount<BATCH>: SupportedLaneCount
-{
-}
-
-impl<const ROWS: usize, const BATCH: usize> PartialEq for DualBatchVector<ROWS, BATCH>
-where
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    fn eq(&self, other: &Self) -> bool {
-        self.real_part == other.real_part && self.dij_part == other.dij_part
-    }
-}
-
-impl<const ROWS: usize, const BATCH: usize> AbsDiffEq for DualBatchVector<ROWS, BATCH>
-where
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    type Epsilon = f64;
-
-    fn default_epsilon() -> Self::Epsilon {
-        f64::default_epsilon()
-    }
-
-    fn abs_diff_eq(&self, other: &Self, epsilon: Self::Epsilon) -> bool {
-        self.real_part.abs_diff_eq(&other.real_part, epsilon)
-    }
-}
-
-impl<const ROWS: usize, const BATCH: usize> RelativeEq for DualBatchVector<ROWS, BATCH>
-where
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    fn default_max_relative() -> Self::Epsilon {
-        f64::default_max_relative()
-    }
-
-    fn relative_eq(
-        &self,
-        other: &Self,
-        epsilon: Self::Epsilon,
-        max_relative: Self::Epsilon,
-    ) -> bool {
-        self.real_part
-            .relative_eq(&other.real_part, epsilon, max_relative)
-    }
-}
-
-impl<const ROWS: usize, const BATCH: usize> IsDualVector<DualBatchScalar<BATCH>, ROWS, BATCH>
-    for DualBatchVector<ROWS, BATCH>
-where
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    fn new_with_dij(val: BatchVecF64<ROWS, BATCH>) -> Self {
-        let mut dij_val = MutTensorDDR::<BatchScalarF64<BATCH>, ROWS>::from_shape([ROWS, 1]);
-        for i in 0..ROWS {
-            dij_val.mut_view().get_mut([i, 0])[(i, 0)] = BatchScalarF64::<BATCH>::ones();
-        }
-
-        Self {
-            real_part: val,
-            dij_part: Some(dij_val),
-        }
-    }
-
-    fn dij_val(self) -> Option<MutTensorDDR<BatchScalarF64<BATCH>, ROWS>> {
-        self.dij_part
-    }
-}
-
-impl<const ROWS: usize, const BATCH: usize> DualBatchVector<ROWS, BATCH>
-where
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    fn binary_dij<
-        const R0: usize,
-        const R1: usize,
-        F: FnMut(&BatchVecF64<R0, BATCH>) -> BatchVecF64<ROWS, BATCH>,
-        G: FnMut(&BatchVecF64<R1, BATCH>) -> BatchVecF64<ROWS, BATCH>,
-    >(
-        lhs_dx: &Option<MutTensorDDR<BatchScalarF64<BATCH>, R0>>,
-        rhs_dx: &Option<MutTensorDDR<BatchScalarF64<BATCH>, R1>>,
-        mut left_op: F,
-        mut right_op: G,
-    ) -> Option<MutTensorDDR<BatchScalarF64<BATCH>, ROWS>> {
-        match (lhs_dx, rhs_dx) {
-            (None, None) => None,
-            (None, Some(rhs_dij)) => {
-                let out_dij = MutTensorDDR::from_map(&rhs_dij.view(), |r_dij| right_op(r_dij));
-                Some(out_dij)
-            }
-            (Some(lhs_dij), None) => {
-                let out_dij = MutTensorDDR::from_map(&lhs_dij.view(), |l_dij| left_op(l_dij));
-                Some(out_dij)
-            }
-            (Some(lhs_dij), Some(rhs_dij)) => {
-                let dyn_mat =
-                    MutTensorDDR::from_map2(&lhs_dij.view(), &rhs_dij.view(), |l_dij, r_dij| {
-                        left_op(l_dij) + right_op(r_dij)
-                    });
-                Some(dyn_mat)
-            }
-        }
-    }
-
-    fn binary_vs_dij<
-        const R0: usize,
-        F: FnMut(&BatchVecF64<R0, BATCH>) -> BatchVecF64<ROWS, BATCH>,
-        G: FnMut(&BatchScalarF64<BATCH>) -> BatchVecF64<ROWS, BATCH>,
-    >(
-        lhs_dx: &Option<MutTensorDDR<BatchScalarF64<BATCH>, R0>>,
-        rhs_dx: &Option<MutTensorDD<BatchScalarF64<BATCH>>>,
-        mut left_op: F,
-        mut right_op: G,
-    ) -> Option<MutTensorDDR<BatchScalarF64<BATCH>, ROWS>> {
-        match (lhs_dx, rhs_dx) {
-            (None, None) => None,
-            (None, Some(rhs_dij)) => {
-                let out_dij = MutTensorDDR::from_map(&rhs_dij.view(), |r_dij| right_op(r_dij));
-                Some(out_dij)
-            }
-            (Some(lhs_dij), None) => {
-                let out_dij = MutTensorDDR::from_map(&lhs_dij.view(), |l_dij| left_op(l_dij));
-                Some(out_dij)
-            }
-            (Some(lhs_dij), Some(rhs_dij)) => {
-                let dyn_mat =
-                    MutTensorDDR::from_map2(&lhs_dij.view(), &rhs_dij.view(), |l_dij, r_dij| {
-                        left_op(l_dij) + right_op(r_dij)
-                    });
-                Some(dyn_mat)
-            }
-        }
-    }
-
-    fn two_dx<const R0: usize, const R1: usize>(
-        mut lhs_dx: Option<MutTensorDDR<BatchScalarF64<BATCH>, R0>>,
-        mut rhs_dx: Option<MutTensorDDR<BatchScalarF64<BATCH>, R1>>,
-    ) -> Option<DijPair<BatchScalarF64<BATCH>, R0, R1>> {
-        if lhs_dx.is_none() && rhs_dx.is_none() {
-            return None;
-        }
-
-        if lhs_dx.is_some() && rhs_dx.is_some() {
-            assert_eq!(
-                lhs_dx.clone().unwrap().dims(),
-                rhs_dx.clone().unwrap().dims()
-            );
-        }
-
-        if lhs_dx.is_none() {
-            lhs_dx = Some(MutTensorDDR::from_shape(rhs_dx.clone().unwrap().dims()))
-        } else if rhs_dx.is_none() {
-            rhs_dx = Some(MutTensorDDR::from_shape(lhs_dx.clone().unwrap().dims()))
-        }
-
-        Some(DijPair {
-            lhs: lhs_dx.unwrap(),
-            rhs: rhs_dx.unwrap(),
-        })
-    }
-}
-
-impl<const ROWS: usize, const BATCH: usize> Neg for DualBatchVector<ROWS, BATCH>
-where
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    type Output = DualBatchVector<ROWS, BATCH>;
-
-    fn neg(self) -> Self::Output {
-        DualBatchVector {
-            real_part: -self.real_part,
-            dij_part: self
-                .dij_part
-                .clone()
-                .map(|dij_val| MutTensorDDR::from_map(&dij_val.view(), |v| -v)),
-        }
-    }
-}
-
-impl<const ROWS: usize, const BATCH: usize> Sub for DualBatchVector<ROWS, BATCH>
-where
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    type Output = DualBatchVector<ROWS, BATCH>;
-
-    fn sub(self, rhs: Self) -> Self::Output {
-        DualBatchVector {
-            real_part: self.real_part - rhs.real_part,
-            dij_part: Self::binary_dij(
-                &self.dij_part,
-                &rhs.dij_part,
-                |l_dij| *l_dij,
-                |r_dij| -r_dij,
-            ),
-        }
-    }
-}
-
-impl<const ROWS: usize, const BATCH: usize> Add for DualBatchVector<ROWS, BATCH>
-where
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    type Output = DualBatchVector<ROWS, BATCH>;
-
-    fn add(self, rhs: Self) -> Self::Output {
-        DualBatchVector {
-            real_part: self.real_part + rhs.real_part,
-            dij_part: Self::binary_dij(
-                &self.dij_part,
-                &rhs.dij_part,
-                |l_dij| *l_dij,
-                |r_dij| *r_dij,
-            ),
-        }
-    }
-}
-
-impl<const ROWS: usize, const BATCH: usize> IsVector<DualBatchScalar<BATCH>, ROWS, BATCH>
-    for DualBatchVector<ROWS, BATCH>
-where
-    BatchScalarF64<BATCH>: IsCoreScalar,
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    fn from_f64(val: f64) -> Self {
-        DualBatchVector {
-            real_part:
-                <BatchVecF64<ROWS, BATCH> as IsVector<BatchScalarF64<BATCH>, ROWS, BATCH>>::from_f64(
-                    val,
-                ),
-            dij_part: None,
-        }
-    }
-
-    fn outer<const R2: usize>(
-        self,
-        rhs: DualBatchVector<R2, BATCH>,
-    ) -> DualBatchMatrix<ROWS, R2, BATCH> {
-        let mut result = DualBatchMatrix::zeros();
-        for i in 0..ROWS {
-            for j in 0..R2 {
-                result.set_elem([i, j], self.get_elem(i) * rhs.get_elem(j));
-            }
-        }
-        result
-    }
-
-    fn norm(&self) -> DualBatchScalar<BATCH> {
-        self.clone().dot(self.clone()).sqrt()
-    }
-
-    fn squared_norm(&self) -> DualBatchScalar<BATCH> {
-        self.clone().dot(self.clone())
-    }
-
-    fn get_elem(&self, idx: usize) -> DualBatchScalar<BATCH> {
-        DualBatchScalar {
-            real_part: self.real_part[idx],
-            dij_part: self
-                .dij_part
-                .clone()
-                .map(|dij_val| MutTensorDD::from_map(&dij_val.view(), |v| v[idx])),
-        }
-    }
-
-    fn from_array(duals: [DualBatchScalar<BATCH>; ROWS]) -> Self {
-        let mut shape = None;
-        let mut val_v = BatchVecF64::<ROWS, BATCH>::zeros();
-        for i in 0..duals.len() {
-            let d = duals.clone()[i].clone();
-
-            val_v[i] = d.real_part;
-            if d.dij_part.is_some() {
-                shape = Some(d.dij_part.clone().unwrap().dims());
-            }
-        }
-
-        if shape.is_none() {
-            return DualBatchVector {
-                real_part: val_v,
-                dij_part: None,
-            };
-        }
-        let shape = shape.unwrap();
-
-        let mut r = MutTensorDDR::<BatchScalarF64<BATCH>, ROWS>::from_shape(shape);
-
-        for i in 0..duals.len() {
-            let d = duals.clone()[i].clone();
-            if d.dij_part.is_some() {
-                for d0 in 0..shape[0] {
-                    for d1 in 0..shape[1] {
-                        r.mut_view().get_mut([d0, d1])[(i, 0)] =
-                            d.dij_part.clone().unwrap().get([d0, d1]);
-                    }
-                }
-            }
-        }
-        DualBatchVector {
-            real_part: val_v,
-            dij_part: Some(r),
-        }
-    }
-
-    fn from_real_array(vals: [BatchScalarF64<BATCH>; ROWS]) -> Self {
-        DualBatchVector {
-            real_part: BatchVecF64::from_real_array(vals),
-            dij_part: None,
-        }
-    }
-
-    fn from_real_vector(val: BatchVecF64<ROWS, BATCH>) -> Self {
-        Self {
-            real_part: val,
-            dij_part: None,
-        }
-    }
-
-    fn real_vector(&self) -> &BatchVecF64<ROWS, BATCH> {
-        &self.real_part
-    }
-
-    fn to_mat(self) -> DualBatchMatrix<ROWS, 1, BATCH> {
-        DualBatchMatrix {
-            real_part: self.real_part,
-            dij_part: self.dij_part.map(|dij| dij.inner_vec_to_mat()),
-        }
-    }
-
-    fn block_vec2<const R0: usize, const R1: usize>(
-        top_row: DualBatchVector<R0, BATCH>,
-        bot_row: DualBatchVector<R1, BATCH>,
-    ) -> Self {
-        assert_eq!(R0 + R1, ROWS);
-
-        let maybe_dij = Self::two_dx(top_row.dij_part, bot_row.dij_part);
-        Self {
-            real_part: BatchVecF64::<ROWS, BATCH>::block_vec2(top_row.real_part, bot_row.real_part),
-            dij_part: match maybe_dij {
-                Some(dij_val) => {
-                    let mut r =
-                        MutTensorDDR::<BatchScalarF64<BATCH>, ROWS>::from_shape(dij_val.shape());
-                    for d0 in 0..dij_val.shape()[0] {
-                        for d1 in 0..dij_val.shape()[1] {
-                            *r.mut_view().get_mut([d0, d1]) =
-                                BatchVecF64::<ROWS, BATCH>::block_vec2(
-                                    dij_val.lhs.get([d0, d1]),
-                                    dij_val.rhs.get([d0, d1]),
-                                );
-                        }
-                    }
-                    Some(r)
-                }
-                None => None,
-            },
-        }
-    }
-
-    fn scaled(&self, s: DualBatchScalar<BATCH>) -> Self {
-        DualBatchVector {
-            real_part: self.real_part * s.real_part,
-            dij_part: Self::binary_vs_dij(
-                &self.dij_part,
-                &s.dij_part,
-                |l_dij| l_dij * s.real_part,
-                |r_dij| self.real_part * *r_dij,
-            ),
-        }
-    }
-
-    fn dot(self, rhs: Self) -> DualBatchScalar<BATCH> {
-        let mut sum = DualBatchScalar::from_f64(0.0);
-
-        for i in 0..ROWS {
-            sum += self.get_elem(i) * rhs.get_elem(i);
-        }
-
-        sum
-    }
-
-    fn normalized(&self) -> Self {
-        self.clone()
-            .scaled(DualBatchScalar::<BATCH>::from_f64(1.0) / self.norm())
-    }
-
-    fn from_f64_array(vals: [f64; ROWS]) -> Self {
-        DualBatchVector {
-            real_part: BatchVecF64::from_f64_array(vals),
-            dij_part: None,
-        }
-    }
-
-    fn from_scalar_array(vals: [DualBatchScalar<BATCH>; ROWS]) -> Self {
-        let mut shape = None;
-        let mut val_v = BatchVecF64::<ROWS, BATCH>::zeros();
-        for i in 0..vals.len() {
-            let d = vals.clone()[i].clone();
-
-            val_v[i] = d.real_part;
-            if d.dij_part.is_some() {
-                shape = Some(d.dij_part.clone().unwrap().dims());
-            }
-        }
-
-        if shape.is_none() {
-            return DualBatchVector {
-                real_part: val_v,
-                dij_part: None,
-            };
-        }
-        let shape = shape.unwrap();
-
-        let mut r = MutTensorDDR::<BatchScalarF64<BATCH>, ROWS>::from_shape(shape);
-
-        for i in 0..vals.len() {
-            let d = vals.clone()[i].clone();
-            if d.dij_part.is_some() {
-                for d0 in 0..shape[0] {
-                    for d1 in 0..shape[1] {
-                        r.mut_view().get_mut([d0, d1])[(i, 0)] =
-                            d.dij_part.clone().unwrap().get([d0, d1]);
-                    }
-                }
-            }
-        }
-        DualBatchVector {
-            real_part: val_v,
-            dij_part: Some(r),
-        }
-    }
-
-    fn set_elem(&mut self, idx: usize, v: DualBatchScalar<BATCH>) {
-        self.real_part[idx] = v.real_part;
-        if self.dij_part.is_some() {
-            let dij = &mut self.dij_part.as_mut().unwrap();
-            for i in 0..dij.dims()[0] {
-                for j in 0..dij.dims()[1] {
-                    dij.mut_view().get_mut([i, j])[idx] = v.dij_part.clone().unwrap().get([i, j]);
-                }
-            }
-        }
-    }
-
-    fn to_dual(self) -> <DualBatchScalar<BATCH> as IsScalar<BATCH>>::DualVector<ROWS> {
-        self
-    }
-
-    fn select(self, mask: &Mask<i64, BATCH>, other: Self) -> Self {
-        let maybe_dij = Self::two_dx(self.dij_part, other.dij_part);
-
-        Self {
-            real_part: IsVector::select(self.real_part, mask, other.real_part),
-            dij_part: match maybe_dij {
-                Some(dij) => {
-                    let mut r =
-                        MutTensorDDR::<BatchScalarF64<BATCH>, ROWS>::from_shape(dij.shape());
-                    for i in 0..dij.shape()[0] {
-                        for j in 0..dij.shape()[1] {
-                            *r.get_mut([i, j]) =
-                                IsVector::select(dij.lhs.get([i, j]), mask, dij.rhs.get([i, j]));
-                        }
-                    }
-                    Some(r)
-                }
-                _ => None,
-            },
-        }
-    }
-
-    fn get_fixed_subvec<const R: usize>(&self, start_r: usize) -> DualBatchVector<R, BATCH> {
-        DualBatchVector {
-            real_part: self.real_part.fixed_rows::<R>(start_r).into(),
-            dij_part: self.dij_part.clone().map(|dij_val| {
-                MutTensorDDR::from_map(&dij_val.view(), |v| v.fixed_rows::<R>(start_r).into())
-            }),
-        }
-    }
-}
-
 #[test]
 fn dual_vector_tests() {
-    use crate::calculus::dual::dual_scalar::DualBatchScalar;
     use crate::calculus::dual::dual_scalar::DualScalar;
     use crate::calculus::maps::scalar_valued_maps::ScalarValuedMapFromVector;
     use crate::calculus::maps::vector_valued_maps::VectorValuedMapFromVector;
     use crate::linalg::vector::IsVector;
-    use crate::linalg::BatchScalarF64;
     use crate::points::example_points;
 
+    #[cfg(feature = "simd")]
+    use crate::calculus::dual::DualBatchScalar;
+    #[cfg(feature = "simd")]
+    use crate::linalg::BatchScalarF64;
+
     #[cfg(test)]
     trait Test {
         fn run();
@@ -1132,10 +620,14 @@ fn dual_vector_tests() {
     }
 
     def_test_template!(f64, DualScalar, 1);
+    #[cfg(feature = "simd")]
    def_test_template!(BatchScalarF64<2>, DualBatchScalar<2>, 2);
+    #[cfg(feature = "simd")]
    def_test_template!(BatchScalarF64<4>, DualBatchScalar<4>, 4);
 
     f64::run();
+    #[cfg(feature = "simd")]
    BatchScalarF64::<2>::run();
+    #[cfg(feature = "simd")]
    BatchScalarF64::<4>::run();
 }
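The map and curve test templates that follow all assert the same invariant for every scalar type: a derivative propagated through dual numbers must agree with a finite-difference estimate of the same map. A self-contained sketch of that cross-check, with no sophus types involved (the function and tolerance are arbitrary choices for illustration):

fn main() {
    let f = |x: f64| x * x.sin();
    let x = 0.7_f64;

    // analytic derivative: f'(x) = sin(x) + x * cos(x)
    let analytic = x.sin() + x * x.cos();

    // central finite difference approximation
    let h = 1e-6;
    let numeric = (f(x + h) - f(x - h)) / (2.0 * h);

    assert!((analytic - numeric).abs() < 1e-6);
}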
diff --git a/crates/sophus_core/src/calculus/maps/curves.rs b/crates/sophus_core/src/calculus/maps/curves.rs
index 151200f..49d83ef 100644
--- a/crates/sophus_core/src/calculus/maps/curves.rs
+++ b/crates/sophus_core/src/calculus/maps/curves.rs
@@ -115,9 +115,12 @@ impl<S: IsScalar<BATCH>, const BATCH: usize> MatrixValuedCurve<S, BATCH> {
 #[test]
 fn curve_test() {
-    use crate::calculus::dual::dual_scalar::DualBatchScalar;
-    use crate::calculus::dual::dual_scalar::DualScalar;
+    use crate::calculus::dual::DualScalar;
     use crate::linalg::scalar::IsScalar;
+
+    #[cfg(feature = "simd")]
+    use crate::calculus::dual::DualBatchScalar;
+    #[cfg(feature = "simd")]
     use crate::linalg::BatchScalarF64;
 
     trait CurveTest {
@@ -196,12 +199,18 @@ fn curve_test() {
     }
 
     def_curve_test_template!(1, f64, DualScalar);
+    #[cfg(feature = "simd")]
    def_curve_test_template!(2, BatchScalarF64<2>, DualBatchScalar<2>);
+    #[cfg(feature = "simd")]
    def_curve_test_template!(4, BatchScalarF64<4>, DualBatchScalar<4>);
+    #[cfg(feature = "simd")]
    def_curve_test_template!(8, BatchScalarF64<8>, DualBatchScalar<8>);
 
     DualScalar::run_curve_test();
+    #[cfg(feature = "simd")]
    DualBatchScalar::<2>::run_curve_test();
+    #[cfg(feature = "simd")]
    DualBatchScalar::<4>::run_curve_test();
+    #[cfg(feature = "simd")]
    DualBatchScalar::<8>::run_curve_test();
 }
diff --git a/crates/sophus_core/src/calculus/maps/matrix_valued_maps.rs b/crates/sophus_core/src/calculus/maps/matrix_valued_maps.rs
index cbbc596..9b5687a 100644
--- a/crates/sophus_core/src/calculus/maps/matrix_valued_maps.rs
+++ b/crates/sophus_core/src/calculus/maps/matrix_valued_maps.rs
@@ -155,15 +155,17 @@ impl<S: IsScalar<BATCH>, const BATCH: usize>
 #[test]
 fn matrix_valued_map_from_vector_tests() {
-    use crate::calculus::dual::dual_scalar::DualBatchScalar;
     use crate::calculus::dual::dual_scalar::DualScalar;
     use crate::calculus::maps::matrix_valued_maps::MatrixValuedMapFromVector;
-    use crate::linalg::scalar::IsScalar;
     use crate::linalg::vector::IsVector;
-    use crate::linalg::BatchScalarF64;
     use crate::tensor::tensor_view::IsTensorLike;
 
+    #[cfg(feature = "simd")]
+    use crate::calculus::dual::DualBatchScalar;
+    #[cfg(feature = "simd")]
+    use crate::linalg::BatchScalarF64;
+
     #[cfg(test)]
     trait Test {
         fn run();
@@ -284,10 +286,14 @@ fn matrix_valued_map_from_vector_tests() {
     }
 
     def_test_template!(f64, DualScalar, 1);
+    #[cfg(feature = "simd")]
    def_test_template!(BatchScalarF64<2>, DualBatchScalar<2>, 2);
+    #[cfg(feature = "simd")]
    def_test_template!(BatchScalarF64<4>, DualBatchScalar<4>, 4);
 
     f64::run();
+    #[cfg(feature = "simd")]
    BatchScalarF64::<2>::run();
+    #[cfg(feature = "simd")]
    BatchScalarF64::<4>::run();
 }
diff --git a/crates/sophus_core/src/calculus/maps/scalar_valued_maps.rs b/crates/sophus_core/src/calculus/maps/scalar_valued_maps.rs
index ddee6d7..d6068fe 100644
--- a/crates/sophus_core/src/calculus/maps/scalar_valued_maps.rs
+++ b/crates/sophus_core/src/calculus/maps/scalar_valued_maps.rs
@@ -132,8 +132,10 @@ impl<S: IsScalar<BATCH>, const BATCH: usize> ScalarValuedMapFromMatrix<S, BATCH>
    def_scalar_valued_map_test_template!(2, BatchScalarF64<2>, DualBatchScalar<2>);
+    #[cfg(feature = "simd")]
    def_scalar_valued_map_test_template!(4, BatchScalarF64<4>, DualBatchScalar<4>);
+    #[cfg(feature = "simd")]
    def_scalar_valued_map_test_template!(8, BatchScalarF64<8>, DualBatchScalar<8>);
+    #[cfg(feature = "simd")]
    def_scalar_valued_map_test_template!(16, BatchScalarF64<16>, DualBatchScalar<16>);
+    #[cfg(feature = "simd")]
    def_scalar_valued_map_test_template!(32, BatchScalarF64<32>, DualBatchScalar<32>);
+    #[cfg(feature = "simd")]
    def_scalar_valued_map_test_template!(64, BatchScalarF64<64>, DualBatchScalar<64>);
 
     f64::run();
+    #[cfg(feature = "simd")]
    BatchScalarF64::<2>::run();
+    #[cfg(feature = "simd")]
    BatchScalarF64::<4>::run();
+    #[cfg(feature = "simd")]
    BatchScalarF64::<8>::run();
+    #[cfg(feature = "simd")]
    BatchScalarF64::<16>::run();
+    #[cfg(feature = "simd")]
    BatchScalarF64::<32>::run();
+    #[cfg(feature = "simd")]
    BatchScalarF64::<64>::run();
 }
diff --git a/crates/sophus_core/src/calculus/maps/vector_valued_maps.rs b/crates/sophus_core/src/calculus/maps/vector_valued_maps.rs
index f63bf08..61c510b 100644
--- a/crates/sophus_core/src/calculus/maps/vector_valued_maps.rs
+++ b/crates/sophus_core/src/calculus/maps/vector_valued_maps.rs
@@ -192,15 +192,18 @@ impl<S: IsScalar<BATCH>, const BATCH: usize>
 #[test]
 fn vector_valued_map_from_vector_tests() {
-    use crate::calculus::dual::dual_scalar::DualBatchScalar;
     use crate::calculus::dual::dual_scalar::DualScalar;
     use crate::calculus::maps::vector_valued_maps::VectorValuedMapFromMatrix;
     use crate::calculus::maps::vector_valued_maps::VectorValuedMapFromVector;
     use crate::linalg::vector::IsVector;
-    use crate::linalg::BatchScalarF64;
     use crate::tensor::tensor_view::IsTensorLike;
 
+    #[cfg(feature = "simd")]
+    use crate::calculus::dual::DualBatchScalar;
+    #[cfg(feature = "simd")]
+    use crate::linalg::BatchScalarF64;
+
    #[cfg(test)]
    trait Test {
        fn run();
@@ -306,10 +309,14 @@ fn vector_valued_map_from_vector_tests() {
     }
 
     def_test_template!(f64, DualScalar, 1);
+    #[cfg(feature = "simd")]
    def_test_template!(BatchScalarF64<2>, DualBatchScalar<2>, 2);
+    #[cfg(feature = "simd")]
    def_test_template!(BatchScalarF64<4>, DualBatchScalar<4>, 4);
 
     f64::run();
+    #[cfg(feature = "simd")]
    BatchScalarF64::<2>::run();
+    #[cfg(feature = "simd")]
    BatchScalarF64::<4>::run();
 }
diff --git a/crates/sophus_core/src/lib.rs b/crates/sophus_core/src/lib.rs
index 05e99d0..fe5c45f 100644
--- a/crates/sophus_core/src/lib.rs
+++ b/crates/sophus_core/src/lib.rs
@@ -1,11 +1,11 @@
-#![feature(portable_simd)]
+#![cfg_attr(feature = "simd", feature(portable_simd))]
 #![deny(missing_docs)]
 #![allow(clippy::needless_range_loop)]
 //! Core math functionality including
 //! - linear algebra types
 //!   * such as [linalg::VecF64], and [linalg::MatF64]
 //!   * batch types such as [linalg::BatchScalarF64], [linalg::BatchVecF64],
-//!     [linalg::BatchMatF64]
+//!     [linalg::BatchMatF64] - require the `simd` feature
 //! - tensors
 //!   * design: dynamic tensor (ndarray) of static tensors (nalgebra)
 //! - differentiation tools
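The `cfg_attr` line is what keeps the crate buildable on a stable toolchain when `simd` is off: the nightly-only `portable_simd` gate is emitted only when the cargo feature is requested. For readers unfamiliar with `std::simd`, a minimal nightly-only sketch of the API the batch types wrap (illustrative, not crate code):

#![feature(portable_simd)]
use std::simd::Simd;

fn main() {
    let a = Simd::<f64, 4>::splat(1.5);
    let b = Simd::<f64, 4>::from_array([1.0, 2.0, 3.0, 4.0]);
    let c = a + b; // one lane-wise addition across all four lanes
    assert_eq!(c.to_array(), [2.5, 3.5, 4.5, 5.5]);
}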
diff --git a/crates/sophus_core/src/linalg.rs b/crates/sophus_core/src/linalg.rs
index d4c1ff1..4244d53 100644
--- a/crates/sophus_core/src/linalg.rs
+++ b/crates/sophus_core/src/linalg.rs
@@ -1,22 +1,43 @@
+#[cfg(feature = "simd")]
 /// Boolean mask - generalization of bool to SIMD
+pub mod batch_mask;
+/// Bool and boolean mask traits
 pub mod bool_mask;
+#[cfg(feature = "simd")]
+/// Batch matrix types - require the `simd` feature
+pub mod batch_matrix;
 /// Matrix types
 pub mod matrix;
+#[cfg(feature = "simd")]
+/// Batch scalar types - require the `simd` feature
+pub mod batch_scalar;
 /// Scalar types
 pub mod scalar;
+#[cfg(feature = "simd")]
+/// Batch vector types - require the `simd` feature
+pub mod batch_vector;
 /// Vector types
 pub mod vector;
 
+#[cfg(feature = "simd")]
 use std::ops::Add;
+
+#[cfg(feature = "simd")]
 use std::simd::cmp::SimdPartialEq;
+#[cfg(feature = "simd")]
 use std::simd::num::SimdFloat;
+#[cfg(feature = "simd")]
 use std::simd::LaneCount;
+#[cfg(feature = "simd")]
 use std::simd::Mask;
+#[cfg(feature = "simd")]
 use std::simd::Simd;
+#[cfg(feature = "simd")]
 use std::simd::SimdElement;
+#[cfg(feature = "simd")]
 use std::simd::SupportedLaneCount;
 
 /// Static vector
@@ -25,6 +46,16 @@ pub type SVec<ScalarLike, const ROWS: usize> = nalgebra::SVector<ScalarLike, ROWS>;
 /// Static matrix
 pub type SMat<ScalarLike, const ROWS: usize, const COLS: usize> =
     nalgebra::SMatrix<ScalarLike, ROWS, COLS>;
 
+/// f32 vector
+pub type VecF32<const ROWS: usize> = nalgebra::SVector<f32, ROWS>;
+/// f64 vector
+pub type VecF64<const ROWS: usize> = nalgebra::SMatrix<f64, ROWS, 1>;
+/// f32 matrix
+pub type MatF32<const ROWS: usize, const COLS: usize> = nalgebra::SMatrix<f32, ROWS, COLS>;
+/// f64 matrix
+pub type MatF64<const ROWS: usize, const COLS: usize> = nalgebra::SMatrix<f64, ROWS, COLS>;
+
+#[cfg(feature = "simd")]
 /// Batch of scalar
 #[derive(Clone, Debug, PartialEq, Copy)]
 pub struct BatchScalar<ScalarLike: SimdElement, const BATCH: usize>(
     pub Simd<ScalarLike, BATCH>,
 )
 where
     LaneCount<BATCH>: SupportedLaneCount;
@@ -33,31 +64,28 @@
+#[cfg(feature = "simd")]
 /// Batch of vectors
 pub type BatchVec<ScalarLike, const ROWS: usize, const BATCH: usize> =
     nalgebra::SVector<BatchScalar<ScalarLike, BATCH>, ROWS>;
 
+#[cfg(feature = "simd")]
 /// Batch of matrices
 pub type BatchMat<ScalarLike, const ROWS: usize, const COLS: usize, const BATCH: usize> =
     nalgebra::SMatrix<BatchScalar<ScalarLike, BATCH>, ROWS, COLS>;
 
-/// f32 vector
-pub type VecF32<const ROWS: usize> = nalgebra::SVector<f32, ROWS>;
-/// f64 vector
-pub type VecF64<const ROWS: usize> = nalgebra::SMatrix<f64, ROWS, 1>;
-/// f64 matrix
-pub type MatF32<const ROWS: usize, const COLS: usize> = nalgebra::SMatrix<f32, ROWS, COLS>;
-/// f64 matrix
-pub type MatF64<const ROWS: usize, const COLS: usize> = nalgebra::SMatrix<f64, ROWS, COLS>;
-
+#[cfg(feature = "simd")]
 /// batch of f64 scalars
 pub type BatchScalarF64<const BATCH: usize> = BatchScalar<f64, BATCH>;
 /// batch of f64 vectors
+#[cfg(feature = "simd")]
 pub type BatchVecF64<const ROWS: usize, const BATCH: usize> = BatchVec<f64, ROWS, BATCH>;
 /// batch of f64 matrices
+#[cfg(feature = "simd")]
 pub type BatchMatF64<const ROWS: usize, const COLS: usize, const BATCH: usize> =
     BatchMat<f64, ROWS, COLS, BATCH>;
 
+#[cfg(feature = "simd")]
 impl<ScalarLike: SimdElement, const BATCH: usize> Add for BatchScalar<ScalarLike, BATCH>
 where
     LaneCount<BATCH>: SupportedLaneCount,
@@ -70,6 +98,7 @@ where
     }
 }
 
+#[cfg(feature = "simd")]
 impl<ScalarLike: SimdElement + num_traits::Zero, const BATCH: usize> num_traits::Zero
     for BatchScalar<ScalarLike, BATCH>
 where
@@ -90,11 +119,17 @@ where
 
 #[test]
 fn test_core() {
-    use crate::linalg::scalar::IsScalar;
     use approx::assert_abs_diff_eq;
 
     let vec = SVec::<f64, 3>::new(1.0, 2.0, 3.0);
     assert_abs_diff_eq!(vec, SVec::<f64, 3>::new(1.0, 2.0, 3.0));
+}
+
+#[test]
+#[cfg(feature = "simd")]
+fn test_simd_core() {
+    use crate::linalg::scalar::IsScalar;
+    use approx::assert_abs_diff_eq;
 
     let batch_scalar = BatchScalar::<f64, 4>(Simd::<f64, 4>::from_array([1.0, 2.0, 3.0, 4.0]));
     assert_abs_diff_eq!(
diff --git a/crates/sophus_core/src/linalg/batch_mask.rs b/crates/sophus_core/src/linalg/batch_mask.rs
new file mode 100644
index 0000000..2862350
--- /dev/null
+++ b/crates/sophus_core/src/linalg/batch_mask.rs
@@ -0,0 +1,35 @@
+use crate::prelude::IsBoolMask;
+use std::simd::LaneCount;
+use std::simd::Mask;
+use std::simd::MaskElement;
+use std::simd::SupportedLaneCount;
+
+impl<T, const BATCH: usize> IsBoolMask for Mask<T, BATCH>
+where
+    T: MaskElement,
+    LaneCount<BATCH>: SupportedLaneCount,
+{
+    fn all_true() -> Self {
+        Mask::from_array([true; BATCH])
+    }
+
+    fn all_false() -> Self {
+        Mask::from_array([false; BATCH])
+    }
+
+    fn all(&self) -> bool {
+        Mask::all(*self)
+    }
+
+    fn any(&self) -> bool {
+        Mask::any(*self)
+    }
+
+    fn count(&self) -> usize {
+        self.to_array().iter().filter(|x| **x).count()
+    }
+
+    fn lanes(&self) -> usize {
+        BATCH
+    }
+}
diff --git a/crates/sophus_core/src/linalg/batch_matrix.rs b/crates/sophus_core/src/linalg/batch_matrix.rs
new file mode 100644
index 0000000..cfc95d8
--- /dev/null
+++ b/crates/sophus_core/src/linalg/batch_matrix.rs
@@ -0,0 +1,148 @@
+use crate::calculus::dual::DualBatchMatrix;
+
+use crate::linalg::BatchMatF64;
+use crate::linalg::BatchScalarF64;
+use crate::linalg::BatchVecF64;
+
+use crate::prelude::*;
+
+use std::simd::LaneCount;
+use std::simd::Mask;
+use std::simd::SupportedLaneCount;
+
+impl<const ROWS: usize, const COLS: usize, const BATCH: usize>
+    IsMatrix<BatchScalarF64<BATCH>, ROWS, COLS, BATCH> for BatchMatF64<ROWS, COLS, BATCH>
+where
+    LaneCount<BATCH>: SupportedLaneCount,
+{
+    fn from_scalar(val: BatchScalarF64<BATCH>) -> Self {
+        Self::from_element(val)
+    }
+
+    fn from_real_matrix(val: Self) -> Self {
+        val
+    }
+
+    fn real_matrix(&self) -> &Self {
+        self
+    }
+
+    fn scaled(&self, v: BatchScalarF64<BATCH>) -> Self {
+        self * v
+    }
+
+    fn identity() -> Self {
+        nalgebra::SMatrix::<BatchScalarF64<BATCH>, ROWS, COLS>::identity()
+    }
+
+    fn from_array2(vals: [[BatchScalarF64<BATCH>; COLS]; ROWS]) -> Self {
+        Self::from_fn(|r, c| vals[r][c])
+    }
+
+    fn from_real_scalar_array2(vals: [[BatchScalarF64<BATCH>; COLS]; ROWS]) -> Self {
+        Self::from_fn(|r, c| vals[r][c])
+    }
+
+    fn from_f64_array2(vals: [[f64; COLS]; ROWS]) -> Self {
+        Self::from_fn(|r, c| BatchScalarF64::<BATCH>::from_f64(vals[r][c]))
+    }
+
+    fn get_elem(&self, idx: [usize; 2]) -> BatchScalarF64<BATCH> {
+        self[(idx[0], idx[1])]
+    }
+
+    fn block_mat2x1<const R0: usize, const R1: usize>(
+        top_row: BatchMatF64<R0, COLS, BATCH>,
+        bot_row: BatchMatF64<R1, COLS, BATCH>,
+    ) -> Self {
+        Self::from_fn(|r, c| {
+            if r < R0 {
+                top_row[(r, c)]
+            } else {
+                bot_row[(r - R0, c)]
+            }
+        })
+    }
+
+    fn block_mat1x2<const C0: usize, const C1: usize>(
+        left_col: BatchMatF64<ROWS, C0, BATCH>,
+        righ_col: BatchMatF64<ROWS, C1, BATCH>,
+    ) -> Self {
+        Self::from_fn(|r, c| {
+            if c < C0 {
+                left_col[(r, c)]
+            } else {
+                righ_col[(r, c - C0)]
+            }
+        })
+    }
+
+    fn block_mat2x2<const R0: usize, const R1: usize, const C0: usize, const C1: usize>(
+        top_row: (BatchMatF64<R0, C0, BATCH>, BatchMatF64<R0, C1, BATCH>),
+        bot_row: (BatchMatF64<R1, C0, BATCH>, BatchMatF64<R1, C1, BATCH>),
+    ) -> Self {
+        Self::from_fn(|r, c| {
+            if r < R0 {
+                if c < C0 {
+                    top_row.0[(r, c)]
+                } else {
+                    top_row.1[(r, c - C0)]
+                }
+            } else if c < C0 {
+                bot_row.0[(r - R0, c)]
+            } else {
+                bot_row.1[(r - R0, c - C0)]
+            }
+        })
+    }
+
+    fn mat_mul<const COLS2: usize>(
+        &self,
+        other: BatchMatF64<COLS, COLS2, BATCH>,
+    ) -> BatchMatF64<ROWS, COLS2, BATCH> {
+        self * other
+    }
+
+    fn set_col_vec(&mut self, c: usize, v: BatchVecF64<ROWS, BATCH>) {
+        self.fixed_columns_mut::<1>(c).copy_from(&v);
+    }
+
+    fn get_fixed_submat<const R: usize, const C: usize>(
+        &self,
+        start_r: usize,
+        start_c: usize,
+    ) -> BatchMatF64<R, C, BATCH> {
+        self.fixed_view::<R, C>(start_r, start_c).into()
+    }
+
+    fn get_col_vec(&self, c: usize) -> BatchVecF64<ROWS, BATCH> {
+        self.fixed_view::<ROWS, 1>(0, c).into()
+    }
+
+    fn get_row_vec(&self, r: usize) -> BatchVecF64<COLS, BATCH> {
+        self.fixed_view::<1, COLS>(r, 0).transpose()
+    }
+
+    fn from_f64(val: f64) -> Self {
+        Self::from_element(BatchScalarF64::<BATCH>::from_f64(val))
+    }
+
+    fn to_dual(self) -> <BatchScalarF64<BATCH> as IsScalar<BATCH>>::DualMatrix<ROWS, COLS> {
+        DualBatchMatrix::from_real_matrix(self)
+    }
+
+    fn select(self, mask: &Mask<i64, BATCH>, other: Self) -> Self {
+        self.zip_map(&other, |a, b| a.select(mask, b))
+    }
+
+    fn set_elem(&mut self, idx: [usize; 2], val: BatchScalarF64<BATCH>) {
+        self[(idx[0], idx[1])] = val;
+    }
+}
+
+impl<const ROWS: usize, const COLS: usize, const BATCH: usize>
+    IsRealMatrix<BatchScalarF64<BATCH>, ROWS, COLS, BATCH> for BatchMatF64<ROWS, COLS, BATCH>
+where
+    LaneCount<BATCH>: SupportedLaneCount,
+{
+}
diff --git a/crates/sophus_core/src/linalg/batch_scalar.rs b/crates/sophus_core/src/linalg/batch_scalar.rs
new file mode 100644
index 0000000..c2bdaad
--- /dev/null
+++ b/crates/sophus_core/src/linalg/batch_scalar.rs
@@ -0,0 +1,318 @@
+use crate::calculus::dual::DualBatchMatrix;
+use crate::calculus::dual::DualBatchScalar;
+use crate::calculus::dual::DualBatchVector;
+use crate::linalg::scalar::IsBatchScalar;
+use crate::linalg::scalar::NumberCategory;
+use crate::linalg::BatchMatF64;
+use crate::linalg::BatchScalar;
+use crate::linalg::BatchScalarF64;
+use crate::linalg::BatchVecF64;
+use crate::prelude::IsCoreScalar;
+use crate::prelude::*;
+use approx::AbsDiffEq;
+use approx::RelativeEq;
+use std::fmt::Debug;
+use std::ops::AddAssign;
+use std::ops::Div;
+use std::ops::Mul;
+use std::ops::MulAssign;
+use std::ops::Neg;
+use std::ops::Sub;
+use std::ops::SubAssign;
+use std::simd::cmp::SimdPartialOrd;
+use std::simd::num::SimdFloat;
+use std::simd::LaneCount;
+use std::simd::Mask;
+use std::simd::Simd;
+use std::simd::SimdElement;
+use std::simd::StdFloat;
+use std::simd::SupportedLaneCount;
+
+impl<ScalarLike: SimdElement, const BATCH: usize> IsCoreScalar
+    for BatchScalar<ScalarLike, BATCH>
+where
+    LaneCount<BATCH>: SupportedLaneCount,
+    Simd<ScalarLike, BATCH>: SimdFloat,
+    BatchScalar<ScalarLike, BATCH>:
+        Clone + Debug + nalgebra::Scalar + num_traits::Zero + std::ops::AddAssign,
+{
+    fn number_category() -> NumberCategory {
+        NumberCategory::Real
+    }
+}
+
+impl<const BATCH: usize> AbsDiffEq for BatchScalarF64<BATCH>
+where
+    LaneCount<BATCH>: SupportedLaneCount,
+{
+    type Epsilon = f64;
+
+    fn default_epsilon() -> f64 {
+        f64::default_epsilon()
+    }
+
+    fn abs_diff_eq(&self, other: &Self, epsilon: f64) -> bool {
+        for i in 0..BATCH {
+            if !self.0[i].abs_diff_eq(&other.0[i], epsilon) {
+                return false;
+            }
+        }
+        true
+    }
+}
+
+impl<const BATCH: usize> RelativeEq for BatchScalarF64<BATCH>
+where
+    LaneCount<BATCH>: SupportedLaneCount,
+{
+    fn default_max_relative() -> Self::Epsilon {
+        f64::default_max_relative()
+    }
+
+    fn relative_eq(
+        &self,
+        other: &Self,
+        epsilon: Self::Epsilon,
+        max_relative: Self::Epsilon,
+    ) -> bool {
+        for i in 0..BATCH {
+            if !self.0[i].relative_eq(&other.0[i], epsilon, max_relative) {
+                return false;
+            }
+        }
+        true
+    }
+}
+
+impl<const BATCH: usize> AddAssign for BatchScalarF64<BATCH>
+where
+    LaneCount<BATCH>: SupportedLaneCount,
+{
+    fn add_assign(&mut self, rhs: Self) {
+        self.0 += rhs.0;
+    }
+}
+
+impl<const BATCH: usize> SubAssign for BatchScalarF64<BATCH>
+where
+    LaneCount<BATCH>: SupportedLaneCount,
+{
+    fn sub_assign(&mut self, rhs: Self) {
+        self.0 -= rhs.0;
+    }
+}
+
+impl<const BATCH: usize> MulAssign for BatchScalarF64<BATCH>
+where
+    LaneCount<BATCH>: SupportedLaneCount,
+{
+    fn mul_assign(&mut self, rhs: Self) {
+        self.0 *= rhs.0;
+    }
+}
+
+impl<const BATCH: usize> Neg for BatchScalarF64<BATCH>
+where
+    LaneCount<BATCH>: SupportedLaneCount,
+{
+    type Output = Self;
+
+    fn neg(self) -> Self {
+        BatchScalarF64 { 0: -self.0 }
+    }
+}
+
+impl<const BATCH: usize> Sub for BatchScalarF64<BATCH>
+where
+    LaneCount<BATCH>: SupportedLaneCount,
+{
+    type Output = Self;
+
+    fn sub(self, rhs: Self) -> Self {
+        BatchScalarF64 { 0: self.0 - rhs.0 }
+    }
+}
+
+impl<const BATCH: usize> Mul for BatchScalarF64<BATCH>
+where
+    LaneCount<BATCH>: SupportedLaneCount,
+{
+    type Output = Self;
+
+    fn mul(self, rhs: Self) -> Self {
+        BatchScalarF64 { 0: self.0 * rhs.0 }
+    }
+}
+
+impl<const BATCH: usize> Div for BatchScalarF64<BATCH>
+where
+    LaneCount<BATCH>: SupportedLaneCount,
+{
+    type Output = Self;
+
+    fn div(self, rhs: Self) -> Self {
+        BatchScalarF64 { 0: self.0 / rhs.0 }
+    }
+}
+
+impl<const BATCH: usize> num_traits::One for BatchScalarF64<BATCH>
+where
+    LaneCount<BATCH>: SupportedLaneCount,
+{
+    fn one() -> Self {
+        Self(Simd::<f64, BATCH>::splat(1.0))
+    }
+}
+
+impl<const BATCH: usize> IsRealScalar<BATCH> for BatchScalarF64<BATCH> where
+    LaneCount<BATCH>: SupportedLaneCount
+{
+}
+
+impl<const BATCH: usize> IsBatchScalar<BATCH> for BatchScalarF64<BATCH> where
+    LaneCount<BATCH>: SupportedLaneCount
+{
+}
+
+impl<const BATCH: usize> IsScalar<BATCH> for BatchScalarF64<BATCH>
+where
+    LaneCount<BATCH>: SupportedLaneCount,
+{
+    type Scalar = BatchScalarF64<BATCH>;
+    type RealScalar = Self;
+    type SingleScalar = f64;
+    type DualScalar = DualBatchScalar<BATCH>;
+
+    type RealVector<const ROWS: usize> = BatchVecF64<ROWS, BATCH>;
+
+    type Vector<const ROWS: usize> = BatchVecF64<ROWS, BATCH>;
+    type Matrix<const ROWS: usize, const COLS: usize> = BatchMatF64<ROWS, COLS, BATCH>;
+    type RealMatrix<const ROWS: usize, const COLS: usize> = BatchMatF64<ROWS, COLS, BATCH>;
+
+    type Mask = Mask<i64, BATCH>;
+
+    fn less_equal(&self, rhs: &Self) -> Self::Mask {
+        self.0.simd_le(rhs.0)
+    }
+
+    fn greater_equal(&self, rhs: &Self) -> Self::Mask {
+        self.0.simd_ge(rhs.0)
+    }
+
+    fn scalar_examples() -> Vec<BatchScalarF64<BATCH>> {
+        vec![
+            BatchScalarF64::<BATCH>::from_f64(1.0),
+            BatchScalarF64::<BATCH>::from_f64(2.0),
+            BatchScalarF64::<BATCH>::from_f64(3.0),
+        ]
+    }
+
+    fn extract_single(&self, i: usize) -> f64 {
+        self.0[i]
+    }
+
+    fn from_real_scalar(val: BatchScalarF64<BATCH>) -> Self {
+        val
+    }
+
+    fn real_part(&self) -> Self {
+        *self
+    }
+
+    fn from_f64(val: f64) -> Self {
+        BatchScalarF64 {
+            0: Simd::<f64, BATCH>::splat(val),
+        }
+    }
+
+    fn abs(self) -> Self {
+        BatchScalarF64 {
+            0: SimdFloat::abs(self.0),
+        }
+    }
+
+    fn from_real_array(arr: [f64; BATCH]) -> Self {
+        BatchScalarF64 {
+            0: Simd::<f64, BATCH>::from_array(arr),
+        }
+    }
+
+    fn to_real_array(&self) -> [f64; BATCH] {
+        self.0.to_array()
+    }
+
+    fn cos(self) -> Self {
+        BatchScalarF64 {
+            0: sleef::Sleef::cos(self.0),
+        }
+    }
+
+    fn sin(self) -> Self {
+        BatchScalarF64 {
+            0: sleef::Sleef::sin(self.0),
+        }
+    }
+
+    fn tan(self) -> Self {
+        BatchScalarF64 {
+            0: sleef::Sleef::tan(self.0),
+        }
+    }
+
+    fn acos(self) -> Self {
+        BatchScalarF64 {
+            0: sleef::Sleef::acos(self.0),
+        }
+    }
+
+    fn asin(self) -> Self {
+        BatchScalarF64 {
+            0: sleef::Sleef::asin(self.0),
+        }
+    }
+
+    fn atan(self) -> Self {
+        BatchScalarF64 {
+            0: sleef::Sleef::atan(self.0),
+        }
+    }
+
+    fn sqrt(self) -> Self {
+        BatchScalarF64 { 0: self.0.sqrt() }
+    }
+
+    fn atan2(self, x: Self) -> Self {
+        BatchScalarF64 {
+            0: sleef::Sleef::atan2(self.0, x.0),
+        }
+    }
+
+    fn to_vec(self) -> Self::Vector<1> {
+        BatchVecF64::<1, BATCH>::from_scalar(self)
+    }
+
+    fn fract(self) -> Self {
+        BatchScalarF64 { 0: self.0.fract() }
+    }
+
+    fn floor(&self) -> BatchScalarF64<BATCH> {
+        BatchScalarF64 { 0: self.0.floor() }
+    }
+
+    fn signum(&self) -> Self {
+        BatchScalarF64 { 0: self.0.signum() }
+    }
+
+    type DualVector<const ROWS: usize> = DualBatchVector<ROWS, BATCH>;
+
+    type DualMatrix<const ROWS: usize, const COLS: usize> = DualBatchMatrix<ROWS, COLS, BATCH>;
+
+    fn to_dual(self) -> Self::DualScalar {
+        DualBatchScalar::from_real_scalar(self)
+    }
+
+    fn select(self, mask: &Self::Mask, other: Self) -> Self {
+        BatchScalarF64 {
+            0: mask.select(self.0, other.0),
+        }
+    }
+}
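`less_equal`/`greater_equal` return a per-lane `Mask`, and `select` uses it for branch-free blending; that is the entire control-flow story for batch scalars. The `std::simd` equivalent of the `simd_le`/`select` calls above, as an illustrative nightly-only sketch:

#![feature(portable_simd)]
use std::simd::cmp::SimdPartialOrd;
use std::simd::Simd;

fn main() {
    let a = Simd::<f64, 4>::from_array([1.0, 5.0, 2.0, 8.0]);
    let b = Simd::<f64, 4>::splat(3.0);
    let mask = a.simd_le(b); // [true, false, true, false]
    let clamped = mask.select(a, b); // per-lane min(a, 3.0), no branches
    assert_eq!(clamped.to_array(), [1.0, 3.0, 2.0, 3.0]);
}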
diff --git a/crates/sophus_core/src/linalg/batch_vector.rs b/crates/sophus_core/src/linalg/batch_vector.rs
new file mode 100644
index 0000000..8509356
--- /dev/null
+++ b/crates/sophus_core/src/linalg/batch_vector.rs
@@ -0,0 +1,119 @@
+use crate::calculus::dual::DualBatchVector;
+use crate::linalg::BatchMatF64;
+use crate::linalg::BatchScalarF64;
+use crate::linalg::BatchVecF64;
+use crate::prelude::*;
+use std::simd::LaneCount;
+use std::simd::Mask;
+use std::simd::SupportedLaneCount;
+
+impl<const ROWS: usize, const BATCH: usize> IsVector<BatchScalarF64<BATCH>, ROWS, BATCH>
+    for BatchVecF64<ROWS, BATCH>
+where
+    LaneCount<BATCH>: SupportedLaneCount,
+{
+    fn block_vec2<const R0: usize, const R1: usize>(
+        top_row: BatchVecF64<R0, BATCH>,
+        bot_row: BatchVecF64<R1, BATCH>,
+    ) -> Self {
+        assert_eq!(ROWS, R0 + R1);
+        let mut m = Self::zeros();
+
+        m.fixed_view_mut::<R0, 1>(0, 0).copy_from(&top_row);
+        m.fixed_view_mut::<R1, 1>(R0, 0).copy_from(&bot_row);
+        m
+    }
+
+    fn dot(self, rhs: Self) -> BatchScalarF64<BATCH> {
+        (self.transpose() * rhs)[0]
+    }
+
+    fn from_array(vals: [BatchScalarF64<BATCH>; ROWS]) -> Self {
+        Self::from_fn(|i, _| vals[i])
+    }
+
+    fn from_real_array(vals: [BatchScalarF64<BATCH>; ROWS]) -> Self {
+        Self::from_fn(|i, _| vals[i])
+    }
+
+    fn from_f64_array(vals: [f64; ROWS]) -> Self {
+        Self::from_fn(|i, _| BatchScalarF64::<BATCH>::from_f64(vals[i]))
+    }
+
+    fn from_scalar_array(vals: [BatchScalarF64<BATCH>; ROWS]) -> Self {
+        Self::from_fn(|i, _| vals[i])
+    }
+
+    fn from_real_vector(val: BatchVecF64<ROWS, BATCH>) -> Self {
+        val
+    }
+
+    fn get_elem(&self, idx: usize) -> BatchScalarF64<BATCH> {
+        self[idx]
+    }
+
+    fn norm(&self) -> BatchScalarF64<BATCH> {
+        self.squared_norm().sqrt()
+    }
+
+    fn normalized(&self) -> Self {
+        let norm = self.norm();
+        if norm == <BatchScalarF64<BATCH> as IsScalar<BATCH>>::zeros() {
+            return *self;
+        }
+        let factor = BatchScalarF64::<BATCH>::ones() / norm;
+        self * factor
+    }
+
+    fn real_vector(&self) -> &BatchVecF64<ROWS, BATCH> {
+        self
+    }
+
+    fn scaled(&self, v: BatchScalarF64<BATCH>) -> Self {
+        self * v
+    }
+
+    fn set_elem(&mut self, idx: usize, v: BatchScalarF64<BATCH>) {
+        self[idx] = v;
+    }
+
+    fn squared_norm(&self) -> BatchScalarF64<BATCH> {
+        let mut squared_norm = <BatchScalarF64<BATCH> as IsScalar<BATCH>>::zeros();
+        for i in 0..ROWS {
+            let val = IsVector::get_elem(self, i);
+            squared_norm += val * val;
+        }
+        squared_norm
+    }
+
+    fn to_mat(self) -> BatchMatF64<ROWS, 1, BATCH> {
+        self
+    }
+
+    fn from_f64(val: f64) -> Self {
+        Self::from_element(BatchScalarF64::<BATCH>::from_f64(val))
+    }
+
+    fn to_dual(self) -> <BatchScalarF64<BATCH> as IsScalar<BATCH>>::DualVector<ROWS> {
+        DualBatchVector::from_real_vector(self)
+    }
+
+    fn outer<const R2: usize>(self, rhs: BatchVecF64<R2, BATCH>) -> BatchMatF64<ROWS, R2, BATCH> {
+        self * rhs.transpose()
+    }
+
+    fn select(self, mask: &Mask<i64, BATCH>, other: Self) -> Self {
+        self.zip_map(&other, |a, b| a.select(mask, b))
+    }
+
+    fn get_fixed_subvec<const R: usize>(&self, start_r: usize) -> BatchVecF64<R, BATCH> {
+        self.fixed_rows::<R>(start_r).into()
+    }
+}
+
+impl<const ROWS: usize, const BATCH: usize> IsRealVector<BatchScalarF64<BATCH>, ROWS, BATCH>
+    for BatchVecF64<ROWS, BATCH>
+where
+    LaneCount<BATCH>: SupportedLaneCount,
+{
+}
diff --git a/crates/sophus_core/src/linalg/bool_mask.rs b/crates/sophus_core/src/linalg/bool_mask.rs
index 9448909..7c2d22c 100644
--- a/crates/sophus_core/src/linalg/bool_mask.rs
+++ b/crates/sophus_core/src/linalg/bool_mask.rs
@@ -1,8 +1,3 @@
-use std::simd::LaneCount;
-use std::simd::Mask;
-use std::simd::MaskElement;
-use std::simd::SupportedLaneCount;
-
 /// Boolean mask - generalization of boolean comparison to SIMDs
 pub trait IsBoolMask {
     /// Mask with all lanes set to true
@@ -51,33 +46,3 @@ impl IsBoolMask for bool {
         1
     }
 }
-
-impl<T, const BATCH: usize> IsBoolMask for Mask<T, BATCH>
-where
-    T: MaskElement,
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    fn all_true() -> Self {
-        Mask::from_array([true; BATCH])
-    }
-
-    fn all_false() -> Self {
-        Mask::from_array([false; BATCH])
-    }
-
-    fn all(&self) -> bool {
-        Mask::all(*self)
-    }
-
-    fn any(&self) -> bool {
-        Mask::any(*self)
-    }
-
-    fn count(&self) -> usize {
-        self.to_array().iter().filter(|x| **x).count()
-    }
-
-    fn lanes(&self) -> usize {
-        BATCH
-    }
-}
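One subtlety worth flagging in `normalized()` above: the zero check compares the whole batch at once, so the early return only triggers when every lane has a zero norm. A per-lane guard can be sketched with the mask machinery instead (hypothetical helper, not part of the patch; nightly plus the `simd` feature assumed):

```rust
// Sketch only: per-lane-safe normalization via mask + `select`.
use sophus_core::linalg::{BatchScalarF64, BatchVecF64};
use sophus_core::prelude::*;

fn normalized_per_lane(v: BatchVecF64<3, 4>) -> BatchVecF64<3, 4> {
    let norm = v.norm();
    // Lanes whose norm is (numerically) zero.
    let tiny = norm.abs().less_equal(&BatchScalarF64::<4>::from_f64(1e-12));
    let scaled = v.scaled(BatchScalarF64::<4>::from_f64(1.0) / norm);
    // Keep the unnormalized value in the tiny lanes, the scaled one elsewhere.
    v.select(&tiny, scaled)
}

fn main() {
    let v = BatchVecF64::<3, 4>::from_f64(0.0); // all-zero lanes stay untouched
    let _ = normalized_per_lane(v);
}
```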
diff --git a/crates/sophus_core/src/linalg/matrix.rs b/crates/sophus_core/src/linalg/matrix.rs
index 6c7cd2f..61ec567 100644
--- a/crates/sophus_core/src/linalg/matrix.rs
+++ b/crates/sophus_core/src/linalg/matrix.rs
@@ -1,8 +1,4 @@
-use crate::calculus::dual::DualBatchMatrix;
 use crate::calculus::dual::DualMatrix;
-use crate::linalg::BatchMatF64;
-use crate::linalg::BatchScalarF64;
-use crate::linalg::BatchVecF64;
 use crate::linalg::MatF64;
 use crate::linalg::VecF64;
 use crate::prelude::*;
@@ -15,9 +11,6 @@ use std::ops::IndexMut;
 use std::ops::Mul;
 use std::ops::Neg;
 use std::ops::Sub;
-use std::simd::LaneCount;
-use std::simd::Mask;
-use std::simd::SupportedLaneCount;
 
 /// Matrix trait
 /// - either a real (f64) or a dual number matrix
@@ -303,140 +296,3 @@ impl<const ROWS: usize, const COLS: usize> IsMatrix<f64, ROWS, COLS, 1> for MatF
         self[(idx[0], idx[1])] = val;
     }
 }
-
-impl<const ROWS: usize, const COLS: usize, const BATCH: usize>
-    IsMatrix<BatchScalarF64<BATCH>, ROWS, COLS, BATCH> for BatchMatF64<ROWS, COLS, BATCH>
-where
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    fn from_scalar(val: BatchScalarF64<BATCH>) -> Self {
-        Self::from_element(val)
-    }
-
-    fn from_real_matrix(val: Self) -> Self {
-        val
-    }
-
-    fn real_matrix(&self) -> &Self {
-        self
-    }
-
-    fn scaled(&self, v: BatchScalarF64<BATCH>) -> Self {
-        self * v
-    }
-
-    fn identity() -> Self {
-        nalgebra::SMatrix::<BatchScalarF64<BATCH>, ROWS, COLS>::identity()
-    }
-
-    fn from_array2(vals: [[BatchScalarF64<BATCH>; COLS]; ROWS]) -> Self {
-        Self::from_fn(|r, c| vals[r][c])
-    }
-
-    fn from_real_scalar_array2(vals: [[BatchScalarF64<BATCH>; COLS]; ROWS]) -> Self {
-        Self::from_fn(|r, c| vals[r][c])
-    }
-
-    fn from_f64_array2(vals: [[f64; COLS]; ROWS]) -> Self {
-        Self::from_fn(|r, c| BatchScalarF64::<BATCH>::from_f64(vals[r][c]))
-    }
-
-    fn get_elem(&self, idx: [usize; 2]) -> BatchScalarF64<BATCH> {
-        self[(idx[0], idx[1])]
-    }
-
-    fn block_mat2x1<const R0: usize, const R1: usize>(
-        top_row: BatchMatF64<R0, COLS, BATCH>,
-        bot_row: BatchMatF64<R1, COLS, BATCH>,
-    ) -> Self {
-        Self::from_fn(|r, c| {
-            if r < R0 {
-                top_row[(r, c)]
-            } else {
-                bot_row[(r - R0, c)]
-            }
-        })
-    }
-
-    fn block_mat1x2<const C0: usize, const C1: usize>(
-        left_col: BatchMatF64<ROWS, C0, BATCH>,
-        righ_col: BatchMatF64<ROWS, C1, BATCH>,
-    ) -> Self {
-        Self::from_fn(|r, c| {
-            if c < C0 {
-                left_col[(r, c)]
-            } else {
-                righ_col[(r, c - C0)]
-            }
-        })
-    }
-
-    fn block_mat2x2<const R0: usize, const R1: usize, const C0: usize, const C1: usize>(
-        top_row: (BatchMatF64<R0, C0, BATCH>, BatchMatF64<R0, C1, BATCH>),
-        bot_row: (BatchMatF64<R1, C0, BATCH>, BatchMatF64<R1, C1, BATCH>),
-    ) -> Self {
-        Self::from_fn(|r, c| {
-            if r < R0 {
-                if c < C0 {
-                    top_row.0[(r, c)]
-                } else {
-                    top_row.1[(r, c - C0)]
-                }
-            } else if c < C0 {
-                bot_row.0[(r - R0, c)]
-            } else {
-                bot_row.1[(r - R0, c - C0)]
-            }
-        })
-    }
-
-    fn mat_mul<const C2: usize>(
-        &self,
-        other: BatchMatF64<COLS, C2, BATCH>,
-    ) -> BatchMatF64<ROWS, C2, BATCH> {
-        self * other
-    }
-
-    fn set_col_vec(&mut self, c: usize, v: BatchVecF64<ROWS, BATCH>) {
-        self.fixed_columns_mut::<1>(c).copy_from(&v);
-    }
-
-    fn get_fixed_submat<const R: usize, const C: usize>(
-        &self,
-        start_r: usize,
-        start_c: usize,
-    ) -> BatchMatF64<R, C, BATCH> {
-        self.fixed_view::<R, C>(start_r, start_c).into()
-    }
-
-    fn get_col_vec(&self, c: usize) -> BatchVecF64<ROWS, BATCH> {
-        self.fixed_view::<ROWS, 1>(0, c).into()
-    }
-
-    fn get_row_vec(&self, r: usize) -> BatchVecF64<COLS, BATCH> {
-        self.fixed_view::<1, COLS>(r, 0).transpose()
-    }
-
-    fn from_f64(val: f64) -> Self {
-        Self::from_element(BatchScalarF64::<BATCH>::from_f64(val))
-    }
-
-    fn to_dual(self) -> <BatchScalarF64<BATCH> as IsScalar<BATCH>>::DualMatrix<ROWS, COLS> {
-        DualBatchMatrix::from_real_matrix(self)
-    }
-
-    fn select(self, mask: &Mask<i64, BATCH>, other: Self) -> Self {
-        self.zip_map(&other, |a, b| a.select(mask, b))
-    }
-
-    fn set_elem(&mut self, idx: [usize; 2], val: BatchScalarF64<BATCH>) {
-        self[(idx[0], idx[1])] = val;
-    }
-}
-
-impl<const ROWS: usize, const COLS: usize, const BATCH: usize>
-    IsRealMatrix<BatchScalarF64<BATCH>, ROWS, COLS, BATCH> for BatchMatF64<ROWS, COLS, BATCH>
-where
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-}
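The block-composition helpers removed here (now living in `batch_matrix.rs`) share their trait API with the plain f64 implementation, which stays in this file. A sketch of the `block_mat2x2` row/column offset logic through `MatF64`, runnable on stable since no SIMD is involved:

```rust
// Sketch only: the block_mat2x2 indexing shown above, via the f64 impl.
use sophus_core::linalg::MatF64;
use sophus_core::prelude::*;

fn main() {
    let a = MatF64::<2, 2>::identity();
    let b = MatF64::<2, 3>::from_f64(0.0);
    let c = MatF64::<1, 2>::from_f64(0.0);
    let d = MatF64::<1, 3>::from_f64(7.0);

    // [ A B ]
    // [ C D ]  -> a 3x5 matrix, assembled by the offset logic above.
    let m = MatF64::<3, 5>::block_mat2x2((a, b), (c, d));
    assert_eq!(m.get_elem([0, 0]), 1.0); // from A
    assert_eq!(m.get_elem([2, 4]), 7.0); // from D
}
```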
diff --git a/crates/sophus_core/src/linalg/scalar.rs b/crates/sophus_core/src/linalg/scalar.rs
index b9613f6..d7f3293 100644
--- a/crates/sophus_core/src/linalg/scalar.rs
+++ b/crates/sophus_core/src/linalg/scalar.rs
@@ -1,13 +1,6 @@
-use crate::calculus::dual::DualBatchMatrix;
-use crate::calculus::dual::DualBatchScalar;
-use crate::calculus::dual::DualBatchVector;
 use crate::calculus::dual::DualMatrix;
 use crate::calculus::dual::DualScalar;
 use crate::calculus::dual::DualVector;
-use crate::linalg::BatchMatF64;
-use crate::linalg::BatchScalar;
-use crate::linalg::BatchScalarF64;
-use crate::linalg::BatchVecF64;
 use crate::linalg::MatF64;
 use crate::linalg::VecF64;
 use crate::prelude::*;
@@ -20,18 +13,9 @@ use std::ops::Add;
 use std::ops::AddAssign;
 use std::ops::Div;
 use std::ops::Mul;
-use std::ops::MulAssign;
 use std::ops::Neg;
 use std::ops::Sub;
 use std::ops::SubAssign;
-use std::simd::cmp::SimdPartialOrd;
-use std::simd::num::SimdFloat;
-use std::simd::LaneCount;
-use std::simd::Mask;
-use std::simd::Simd;
-use std::simd::SimdElement;
-use std::simd::StdFloat;
-use std::simd::SupportedLaneCount;
 
 /// Number category
 #[derive(Debug, Copy, Clone, PartialEq, Eq)]
@@ -73,19 +57,6 @@ def_is_tensor_scalar_single!(i64, NumberCategory::Signed);
 def_is_tensor_scalar_single!(f32, NumberCategory::Real);
 def_is_tensor_scalar_single!(f64, NumberCategory::Real);
 
-impl<S: SimdElement + IsCoreScalar, const BATCH: usize> IsCoreScalar
-    for BatchScalar<S, BATCH>
-where
-    LaneCount<BATCH>: SupportedLaneCount,
-    Simd<S, BATCH>: SimdFloat,
-    BatchScalar<S, BATCH>:
-        Clone + Debug + nalgebra::Scalar + num_traits::Zero + std::ops::AddAssign,
-{
-    fn number_category() -> NumberCategory {
-        NumberCategory::Real
-    }
-}
-
 /// Scalar trait
 ///
 /// - either a real (f64) or a dual number
@@ -185,7 +156,7 @@ pub trait IsScalar<const BATCH_SIZE: usize>:
     ///
     /// - If self is a single scalar, the array must have one element
     /// - If self is a batch scalar, the array must have BATCH_SIZE elements
-    /// - If self is a dual number, the infinitesimal part is set to zero 
+    /// - If self is a dual number, the infinitesimal part is set to zero
     fn from_real_array(arr: [f64; BATCH_SIZE]) -> Self;
 
     /// creates scalar from real scalar
@@ -426,282 +397,17 @@ impl IsSingleScalar for f64 {
     }
 }
 
-impl<const BATCH: usize> AbsDiffEq for BatchScalarF64<BATCH>
-where
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    type Epsilon = f64;
-
-    fn default_epsilon() -> f64 {
-        f64::default_epsilon()
-    }
-
-    fn abs_diff_eq(&self, other: &Self, epsilon: f64) -> bool {
-        for i in 0..BATCH {
-            if !self.0[i].abs_diff_eq(&other.0[i], epsilon) {
-                return false;
-            }
-        }
-        true
-    }
-}
-
-impl<const BATCH: usize> RelativeEq for BatchScalarF64<BATCH>
-where
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    fn default_max_relative() -> Self::Epsilon {
-        f64::default_max_relative()
-    }
-
-    fn relative_eq(
-        &self,
-        other: &Self,
-        epsilon: Self::Epsilon,
-        max_relative: Self::Epsilon,
-    ) -> bool {
-        for i in 0..BATCH {
-            if !self.0[i].relative_eq(&other.0[i], epsilon, max_relative) {
-                return false;
-            }
-        }
-        true
-    }
-}
-
-impl<const BATCH: usize> AddAssign for BatchScalarF64<BATCH>
-where
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    fn add_assign(&mut self, rhs: Self) {
-        self.0 += rhs.0;
-    }
-}
-
-impl<const BATCH: usize> SubAssign for BatchScalarF64<BATCH>
-where
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    fn sub_assign(&mut self, rhs: Self) {
-        self.0 -= rhs.0;
-    }
-}
-
-impl<const BATCH: usize> MulAssign for BatchScalarF64<BATCH>
-where
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    fn mul_assign(&mut self, rhs: Self) {
-        self.0 *= rhs.0;
-    }
-}
-
-impl<const BATCH: usize> Neg for BatchScalarF64<BATCH>
-where
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    type Output = Self;
-
-    fn neg(self) -> Self {
-        BatchScalarF64 { 0: -self.0 }
-    }
-}
-
-impl<const BATCH: usize> Sub for BatchScalarF64<BATCH>
-where
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    type Output = Self;
-
-    fn sub(self, rhs: Self) -> Self {
-        BatchScalarF64 { 0: self.0 - rhs.0 }
-    }
-}
-
-impl<const BATCH: usize> Mul for BatchScalarF64<BATCH>
-where
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    type Output = Self;
-
-    fn mul(self, rhs: Self) -> Self {
-        BatchScalarF64 { 0: self.0 * rhs.0 }
-    }
-}
-
-impl<const BATCH: usize> Div for BatchScalarF64<BATCH>
-where
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    type Output = Self;
-
-    fn div(self, rhs: Self) -> Self {
-        BatchScalarF64 { 0: self.0 / rhs.0 }
-    }
-}
-
-impl<const BATCH: usize> num_traits::One for BatchScalarF64<BATCH>
-where
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    fn one() -> Self {
-        Self(Simd::<f64, BATCH>::splat(1.0))
-    }
-}
-impl<const BATCH: usize> IsRealScalar<BATCH> for BatchScalarF64<BATCH> where
-    LaneCount<BATCH>: SupportedLaneCount
-{
-}
-
-impl<const BATCH: usize> IsBatchScalar<BATCH> for BatchScalarF64<BATCH> where
-    LaneCount<BATCH>: SupportedLaneCount
-{
-}
-
-impl<const BATCH: usize> IsScalar<BATCH> for BatchScalarF64<BATCH>
-where
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    type Scalar = BatchScalarF64<BATCH>;
-    type RealScalar = Self;
-    type SingleScalar = f64;
-    type DualScalar = DualBatchScalar<BATCH>;
-
-    type RealVector<const ROWS: usize> = BatchVecF64<ROWS, BATCH>;
-
-    type Vector<const ROWS: usize> = BatchVecF64<ROWS, BATCH>;
-    type Matrix<const ROWS: usize, const COLS: usize> = BatchMatF64<ROWS, COLS, BATCH>;
-    type RealMatrix<const ROWS: usize, const COLS: usize> = BatchMatF64<ROWS, COLS, BATCH>;
-
-    type Mask = Mask<i64, BATCH>;
-
-    fn less_equal(&self, rhs: &Self) -> Self::Mask {
-        self.0.simd_le(rhs.0)
-    }
-
-    fn greater_equal(&self, rhs: &Self) -> Self::Mask {
-        self.0.simd_ge(rhs.0)
-    }
-
-    fn scalar_examples() -> Vec<BatchScalarF64<BATCH>> {
-        vec![
-            BatchScalarF64::<BATCH>::from_f64(1.0),
-            BatchScalarF64::<BATCH>::from_f64(2.0),
-            BatchScalarF64::<BATCH>::from_f64(3.0),
-        ]
-    }
-
-    fn extract_single(&self, i: usize) -> f64 {
-        self.0[i]
-    }
-
-    fn from_real_scalar(val: BatchScalarF64<BATCH>) -> Self {
-        val
-    }
-
-    fn real_part(&self) -> Self {
-        *self
-    }
-
-    fn from_f64(val: f64) -> Self {
-        BatchScalarF64 {
-            0: Simd::<f64, BATCH>::splat(val),
-        }
-    }
-
-    fn abs(self) -> Self {
-        BatchScalarF64 {
-            0: SimdFloat::abs(self.0),
-        }
-    }
-
-    fn from_real_array(arr: [f64; BATCH]) -> Self {
-        BatchScalarF64 {
-            0: Simd::<f64, BATCH>::from_array(arr),
-        }
-    }
-
-    fn to_real_array(&self) -> [f64; BATCH] {
-        self.0.to_array()
-    }
-
-    fn cos(self) -> Self {
-        BatchScalarF64 { 0: self.0.cos() }
-    }
-
-    fn sin(self) -> Self {
-        BatchScalarF64 { 0: self.0.sin() }
-    }
-
-    fn tan(self) -> Self {
-        BatchScalarF64 {
-            0: sleef::Sleef::tan(self.0),
-        }
-    }
-
-    fn acos(self) -> Self {
-        BatchScalarF64 {
-            0: sleef::Sleef::acos(self.0),
-        }
-    }
-
-    fn asin(self) -> Self {
-        BatchScalarF64 {
-            0: sleef::Sleef::asin(self.0),
-        }
-    }
-
-    fn atan(self) -> Self {
-        BatchScalarF64 {
-            0: sleef::Sleef::atan(self.0),
-        }
-    }
-
-    fn sqrt(self) -> Self {
-        BatchScalarF64 { 0: self.0.sqrt() }
-    }
-
-    fn atan2(self, x: Self) -> Self {
-        BatchScalarF64 {
-            0: sleef::Sleef::atan2(self.0, x.0),
-        }
-    }
-
-    fn to_vec(self) -> Self::Vector<1> {
-        BatchVecF64::<1, BATCH>::from_scalar(self)
-    }
-
-    fn fract(self) -> Self {
-        BatchScalarF64 { 0: self.0.fract() }
-    }
-
-    fn floor(&self) -> BatchScalarF64<BATCH> {
-        BatchScalarF64 { 0: self.0.floor() }
-    }
-
-    fn signum(&self) -> Self {
-        BatchScalarF64 { 0: self.0.signum() }
-    }
-
-    type DualVector<const ROWS: usize> = DualBatchVector<ROWS, BATCH>;
-
-    type DualMatrix<const ROWS: usize, const COLS: usize> = DualBatchMatrix<ROWS, COLS, BATCH>;
-
-    fn to_dual(self) -> Self::DualScalar {
-        DualBatchScalar::from_real_scalar(self)
-    }
-
-    fn select(self, mask: &Self::Mask, other: Self) -> Self {
-        BatchScalarF64 {
-            0: mask.select(self.0, other.0),
-        }
-    }
-}
-
 #[test]
 fn scalar_prop_tests() {
+    #[cfg(feature = "simd")]
+    use crate::linalg::BatchScalarF64;
+
     f64::test_suite();
+
+    #[cfg(feature = "simd")]
     BatchScalarF64::<2>::test_suite();
+    #[cfg(feature = "simd")]
     BatchScalarF64::<4>::test_suite();
+    #[cfg(feature = "simd")]
     BatchScalarF64::<8>::test_suite();
 }
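Since `IsScalar` is implemented for f64, dual numbers, and (behind the gate) batch scalars, numeric code can be written once against the trait. A sketch of a mask-guarded sinc, assuming `IsScalar` carries the usual arithmetic operator bounds (`Div`, `Clone`), as its f64 implementation suggests:

```rust
// Sketch only: generic over any IsScalar implementation from this patch.
use sophus_core::prelude::*;

fn sinc<S: IsScalar<BATCH>, const BATCH: usize>(x: S) -> S {
    // One boolean per lane (a plain bool for BATCH == 1).
    let near_zero = x.clone().abs().less_equal(&S::from_f64(1e-8));
    let exact = x.clone().sin() / x;
    // 1.0 where x is (lane-wise) near zero, sin(x)/x elsewhere.
    S::from_f64(1.0).select(&near_zero, exact)
}

fn main() {
    assert!((sinc::<f64, 1>(1e-12) - 1.0).abs() < 1e-9);
    assert!((sinc::<f64, 1>(1.0) - 1.0_f64.sin()).abs() < 1e-12);
}
```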
diff --git a/crates/sophus_core/src/linalg/vector.rs b/crates/sophus_core/src/linalg/vector.rs
index a2682d9..de7cb38 100644
--- a/crates/sophus_core/src/linalg/vector.rs
+++ b/crates/sophus_core/src/linalg/vector.rs
@@ -1,8 +1,4 @@
-use crate::calculus::dual::DualBatchVector;
 use crate::calculus::dual::DualVector;
-use crate::linalg::BatchMatF64;
-use crate::linalg::BatchScalarF64;
-use crate::linalg::BatchVecF64;
 use crate::linalg::MatF64;
 use crate::linalg::VecF64;
 use crate::prelude::*;
@@ -14,9 +10,6 @@ use std::ops::Index;
 use std::ops::IndexMut;
 use std::ops::Neg;
 use std::ops::Sub;
-use std::simd::LaneCount;
-use std::simd::Mask;
-use std::simd::SupportedLaneCount;
 
 /// Vector - either a real (f64) or a dual number vector
 pub trait IsVector<S: IsScalar<BATCH_SIZE>, const ROWS: usize, const BATCH_SIZE: usize>:
@@ -253,114 +246,3 @@ pub fn cross<S: IsScalar<BATCH>, const BATCH: usize>(
         l0 * r1 - l1 * r0,
     ])
 }
-
-impl<const ROWS: usize, const BATCH: usize> IsVector<BatchScalarF64<BATCH>, ROWS, BATCH>
-    for BatchVecF64<ROWS, BATCH>
-where
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-    fn block_vec2<const R0: usize, const R1: usize>(
-        top_row: BatchVecF64<R0, BATCH>,
-        bot_row: BatchVecF64<R1, BATCH>,
-    ) -> Self {
-        assert_eq!(ROWS, R0 + R1);
-        let mut m = Self::zeros();
-
-        m.fixed_view_mut::<R0, 1>(0, 0).copy_from(&top_row);
-        m.fixed_view_mut::<R1, 1>(R0, 0).copy_from(&bot_row);
-        m
-    }
-
-    fn dot(self, rhs: Self) -> BatchScalarF64<BATCH> {
-        (self.transpose() * rhs)[0]
-    }
-
-    fn from_array(vals: [BatchScalarF64<BATCH>; ROWS]) -> Self {
-        Self::from_fn(|i, _| vals[i])
-    }
-
-    fn from_real_array(vals: [BatchScalarF64<BATCH>; ROWS]) -> Self {
-        Self::from_fn(|i, _| vals[i])
-    }
-
-    fn from_f64_array(vals: [f64; ROWS]) -> Self {
-        Self::from_fn(|i, _| BatchScalarF64::<BATCH>::from_f64(vals[i]))
-    }
-
-    fn from_scalar_array(vals: [BatchScalarF64<BATCH>; ROWS]) -> Self {
-        Self::from_fn(|i, _| vals[i])
-    }
-
-    fn from_real_vector(val: BatchVecF64<ROWS, BATCH>) -> Self {
-        val
-    }
-
-    fn get_elem(&self, idx: usize) -> BatchScalarF64<BATCH> {
-        self[idx]
-    }
-
-    fn norm(&self) -> BatchScalarF64<BATCH> {
-        self.squared_norm().sqrt()
-    }
-
-    fn normalized(&self) -> Self {
-        let norm = self.norm();
-        if norm == <BatchScalarF64<BATCH> as IsScalar<BATCH>>::zeros() {
-            return *self;
-        }
-        let factor = BatchScalarF64::<BATCH>::ones() / norm;
-        self * factor
-    }
-
-    fn real_vector(&self) -> &BatchVecF64<ROWS, BATCH> {
-        self
-    }
-
-    fn scaled(&self, v: BatchScalarF64<BATCH>) -> Self {
-        self * v
-    }
-
-    fn set_elem(&mut self, idx: usize, v: BatchScalarF64<BATCH>) {
-        self[idx] = v;
-    }
-
-    fn squared_norm(&self) -> BatchScalarF64<BATCH> {
-        let mut squared_norm = <BatchScalarF64<BATCH> as IsScalar<BATCH>>::zeros();
-        for i in 0..ROWS {
-            let val = IsVector::get_elem(self, i);
-            squared_norm += val * val;
-        }
-        squared_norm
-    }
-
-    fn to_mat(self) -> BatchMatF64<ROWS, 1, BATCH> {
-        self
-    }
-
-    fn from_f64(val: f64) -> Self {
-        Self::from_element(BatchScalarF64::<BATCH>::from_f64(val))
-    }
-
-    fn to_dual(self) -> <BatchScalarF64<BATCH> as IsScalar<BATCH>>::DualVector<ROWS> {
-        DualBatchVector::from_real_vector(self)
-    }
-
-    fn outer<const R2: usize>(self, rhs: BatchVecF64<R2, BATCH>) -> BatchMatF64<ROWS, R2, BATCH> {
-        self * rhs.transpose()
-    }
-
-    fn select(self, mask: &Mask<i64, BATCH>, other: Self) -> Self {
-        self.zip_map(&other, |a, b| a.select(mask, b))
-    }
-
-    fn get_fixed_subvec<const R: usize>(&self, start_r: usize) -> BatchVecF64<R, BATCH> {
-        self.fixed_rows::<R>(start_r).into()
-    }
-}
-
-impl<const ROWS: usize, const BATCH: usize> IsRealVector<BatchScalarF64<BATCH>, ROWS, BATCH>
-    for BatchVecF64<ROWS, BATCH>
-where
-    LaneCount<BATCH>: SupportedLaneCount,
-{
-}
diff --git a/crates/sophus_core/src/tensor/element.rs b/crates/sophus_core/src/tensor/element.rs
index 4ae17a4..4f22f6f 100644
--- a/crates/sophus_core/src/tensor/element.rs
+++ b/crates/sophus_core/src/tensor/element.rs
@@ -149,10 +149,15 @@ impl STensorFormat {
 #[test]
 fn test_elements() {
+    #[cfg(feature = "simd")]
     use crate::linalg::scalar::IsScalar;
     use crate::linalg::scalar::NumberCategory;
+
+    #[cfg(feature = "simd")]
     use crate::linalg::BatchScalar;
+    #[cfg(feature = "simd")]
     use crate::linalg::BatchScalarF64;
+    #[cfg(feature = "simd")]
     use crate::linalg::BatchVecF64;
 
     use crate::linalg::VecF32;
@@ -160,6 +165,7 @@ fn test_elements() {
     assert_eq!(f32::number_category(), NumberCategory::Real);
     assert_eq!(u32::number_category(), NumberCategory::Unsigned);
     assert_eq!(i32::number_category(), NumberCategory::Signed);
+    #[cfg(feature = "simd")]
     assert_eq!(
         BatchScalar::<f64, 4>::number_category(),
         NumberCategory::Real
@@ -180,8 +186,11 @@ fn test_elements() {
     assert_eq!(mat.scalar([1, 1]), &4.0);
     assert_abs_diff_eq!(mat, SMat::<f64, 2, 2>::new(1.0, 2.0, 3.0, 4.0));
 
-    let batch_vec: BatchVecF64<2, 2> =
-        BatchVecF64::from_element(BatchScalarF64::from_real_array([1.0, 2.0]));
-    assert_eq!(batch_vec.scalar([0]).extract_single(0), 1.0);
-    assert_eq!(batch_vec.scalar([1]).extract_single(1), 2.0);
+    #[cfg(feature = "simd")]
+    {
+        let batch_vec: BatchVecF64<2, 2> =
+            BatchVecF64::from_element(BatchScalarF64::from_real_array([1.0, 2.0]));
+        assert_eq!(batch_vec.scalar([0]).extract_single(0), 1.0);
+        assert_eq!(batch_vec.scalar([1]).extract_single(1), 2.0);
+    }
 }
diff --git a/crates/sophus_core/src/tensor/mut_tensor.rs b/crates/sophus_core/src/tensor/mut_tensor.rs
index 39d6acc..222f586 100644
--- a/crates/sophus_core/src/tensor/mut_tensor.rs
+++ b/crates/sophus_core/src/tensor/mut_tensor.rs
@@ -438,6 +438,7 @@ mut_tensor_is_view!(5, 2, 3);
 
 #[test]
 fn mut_tensor_tests() {
+    #[cfg(feature = "simd")]
     use crate::linalg::BatchMatF64;
     {
         let _rank1_tensor = MutTensorD::<u8>::default();
@@ -514,6 +515,7 @@ fn mut_tensor_tests() {
     }
 
     //linalg
+    #[cfg(feature = "simd")]
     {
         let shape = [3];
diff --git a/crates/sophus_image/Cargo.toml b/crates/sophus_image/Cargo.toml
index 141cea2..d559a4b 100644
--- a/crates/sophus_image/Cargo.toml
+++ b/crates/sophus_image/Cargo.toml
@@ -20,3 +20,6 @@ nalgebra.workspace = true
 ndarray.workspace = true
 num-traits.workspace = true
 png.workspace = true
+
+[features]
+simd = ["sophus_core/simd"]
diff --git a/crates/sophus_image/src/intensity_image/dyn_intensity_image.rs b/crates/sophus_image/src/intensity_image/dyn_intensity_image.rs
index 46d1ac6..4001b5d 100644
--- a/crates/sophus_image/src/intensity_image/dyn_intensity_image.rs
+++ b/crates/sophus_image/src/intensity_image/dyn_intensity_image.rs
@@ -179,7 +179,7 @@ pub enum DynIntensityArcImage {
     RgbaF32(ArcImage4F32),
 }
 
-/// Convert a GenMutImage to an GenArcImage 
+/// Convert a GenMutImage to a GenArcImage
 ///
 impl From<DynIntensityMutImage> for DynIntensityArcImage {
     fn from(image: DynIntensityMutImage) -> Self {
diff --git a/crates/sophus_image/src/lib.rs b/crates/sophus_image/src/lib.rs
index f237429..9333cd4 100644
--- a/crates/sophus_image/src/lib.rs
+++ b/crates/sophus_image/src/lib.rs
@@ -1,4 +1,4 @@
-#![feature(portable_simd)]
+#![cfg_attr(feature = "simd", feature(portable_simd))]
 #![deny(missing_docs)]
 //! image crate - part of the sophus-rs project
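The `cfg_attr` line above is the heart of this patch: the nightly-only `portable_simd` feature is requested only when the new `simd` cargo feature is enabled, so the crates build on stable by default. The same pattern condensed into a standalone sketch (the `double_batch` helper is hypothetical, written only with ops shown in this patch):

```rust
// Crate root of a downstream crate, following the gating pattern of this patch.
#![cfg_attr(feature = "simd", feature(portable_simd))]

// Batch types exist only behind the gate...
#[cfg(feature = "simd")]
use sophus_core::linalg::BatchScalarF64;
#[cfg(feature = "simd")]
use sophus_core::prelude::*;

// ...while the plain f64 path compiles on stable everywhere.
pub fn double(x: f64) -> f64 {
    2.0 * x
}

// Hypothetical gated helper; `from_f64` and `Mul` come from this patch.
#[cfg(feature = "simd")]
pub fn double_batch(x: BatchScalarF64<8>) -> BatchScalarF64<8> {
    BatchScalarF64::<8>::from_f64(2.0) * x
}
```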
diff --git a/crates/sophus_lie/Cargo.toml b/crates/sophus_lie/Cargo.toml
index 11bc3c6..c0888fb 100644
--- a/crates/sophus_lie/Cargo.toml
+++ b/crates/sophus_lie/Cargo.toml
@@ -11,9 +11,14 @@ repository.workspace = true
 version.workspace = true
 
 [dependencies]
-sophus_core.workspace = true
 approx.workspace = true
 assertables.workspace = true
 nalgebra.workspace = true
 num-traits.workspace = true
+
+sophus_core = {workspace = true}
+
+
+[features]
+simd = ["sophus_core/simd"]
diff --git a/crates/sophus_lie/src/factor_lie_group.rs b/crates/sophus_lie/src/factor_lie_group.rs
index 9dab293..8816114 100644
--- a/crates/sophus_lie/src/factor_lie_group.rs
+++ b/crates/sophus_lie/src/factor_lie_group.rs
@@ -4,12 +4,16 @@ use crate::traits::IsRealLieFactorGroupImpl;
 use crate::Rotation2;
 use crate::Rotation3;
 use approx::assert_relative_eq;
-use sophus_core::calculus::dual::DualBatchScalar;
 use sophus_core::calculus::dual::DualScalar;
 use sophus_core::calculus::maps::MatrixValuedMapFromVector;
-use sophus_core::linalg::BatchScalarF64;
 use sophus_core::manifold::traits::TangentImpl;
 
+#[cfg(feature = "simd")]
+use sophus_core::calculus::dual::DualBatchScalar;
+
+#[cfg(feature = "simd")]
+use sophus_core::linalg::BatchScalarF64;
+
 impl<
         S: IsRealScalar<BATCH_SIZE>,
         const DOF: usize,
@@ -154,6 +158,7 @@ macro_rules! def_real_group_test_template {
 }
 
 def_real_group_test_template!(f64, DualScalar, Rotation2<f64, 1>, Rotation2<DualScalar, 1>, 1);
+#[cfg(feature = "simd")]
 def_real_group_test_template!(
     BatchScalarF64<8>,
     DualBatchScalar<8>,
@@ -163,6 +168,7 @@ def_real_group_test_template!(
 );
 
 def_real_group_test_template!(f64, DualScalar, Rotation3<f64, 1>, Rotation3<DualScalar, 1>, 1);
+#[cfg(feature = "simd")]
 def_real_group_test_template!(
     BatchScalarF64<8>,
     DualBatchScalar<8>,
diff --git a/crates/sophus_lie/src/groups/isometry2.rs b/crates/sophus_lie/src/groups/isometry2.rs
index 61825da..7a21157 100644
--- a/crates/sophus_lie/src/groups/isometry2.rs
+++ b/crates/sophus_lie/src/groups/isometry2.rs
@@ -34,15 +34,21 @@ impl<S: IsScalar<BATCH>, const BATCH: usize> Isometry2<S, BATCH> {
 #[test]
 fn isometry2_prop_tests() {
     use crate::real_lie_group::RealLieGroupTest;
-    use sophus_core::calculus::dual::dual_scalar::DualBatchScalar;
     use sophus_core::calculus::dual::dual_scalar::DualScalar;
+
+    #[cfg(feature = "simd")]
+    use sophus_core::calculus::dual::dual_batch_scalar::DualBatchScalar;
+    #[cfg(feature = "simd")]
     use sophus_core::linalg::BatchScalarF64;
 
     Isometry2::<f64, 1>::test_suite();
+    #[cfg(feature = "simd")]
     Isometry2::<BatchScalarF64<8>, 8>::test_suite();
     Isometry2::<DualScalar, 1>::test_suite();
+    #[cfg(feature = "simd")]
     Isometry2::<DualBatchScalar<8>, 8>::test_suite();
 
     Isometry2::<f64, 1>::run_real_tests();
+    #[cfg(feature = "simd")]
     Isometry2::<BatchScalarF64<8>, 8>::run_real_tests();
 }
diff --git a/crates/sophus_lie/src/groups/isometry3.rs b/crates/sophus_lie/src/groups/isometry3.rs
index e14cf50..4a650e6 100644
--- a/crates/sophus_lie/src/groups/isometry3.rs
+++ b/crates/sophus_lie/src/groups/isometry3.rs
@@ -33,15 +33,21 @@ impl<S: IsScalar<BATCH>, const BATCH: usize> Isometry3<S, BATCH> {
 #[test]
 fn isometry3_prop_tests() {
     use crate::real_lie_group::RealLieGroupTest;
-    use sophus_core::calculus::dual::dual_scalar::DualBatchScalar;
     use sophus_core::calculus::dual::dual_scalar::DualScalar;
+
+    #[cfg(feature = "simd")]
+    use sophus_core::calculus::dual::DualBatchScalar;
+    #[cfg(feature = "simd")]
     use sophus_core::linalg::BatchScalarF64;
 
     Isometry3::<f64, 1>::test_suite();
+    #[cfg(feature = "simd")]
     Isometry3::<BatchScalarF64<8>, 8>::test_suite();
     Isometry3::<DualScalar, 1>::test_suite();
+    #[cfg(feature = "simd")]
     Isometry3::<DualBatchScalar<8>, 8>::test_suite();
 
     Isometry3::<f64, 1>::run_real_tests();
+    #[cfg(feature = "simd")]
     Isometry3::<BatchScalarF64<8>, 8>::run_real_tests();
 }
"simd")] Isometry3::, 8>::run_real_tests(); } diff --git a/crates/sophus_lie/src/groups/rotation2.rs b/crates/sophus_lie/src/groups/rotation2.rs index 7ca396c..665cb29 100644 --- a/crates/sophus_lie/src/groups/rotation2.rs +++ b/crates/sophus_lie/src/groups/rotation2.rs @@ -298,18 +298,27 @@ impl, const BATCH_SIZE: usize> fn rotation2_prop_tests() { use crate::factor_lie_group::RealFactorLieGroupTest; use crate::real_lie_group::RealLieGroupTest; - use sophus_core::calculus::dual::dual_scalar::DualBatchScalar; use sophus_core::calculus::dual::dual_scalar::DualScalar; + + #[cfg(feature = "simd")] + use sophus_core::calculus::dual::DualBatchScalar; + + #[cfg(feature = "simd")] use sophus_core::linalg::BatchScalarF64; Rotation2::::test_suite(); + #[cfg(feature = "simd")] Rotation2::, 8>::test_suite(); + Rotation2::::test_suite(); + #[cfg(feature = "simd")] Rotation2::, 8>::test_suite(); Rotation2::::run_real_tests(); + #[cfg(feature = "simd")] Rotation2::, 8>::run_real_tests(); Rotation2::::run_real_factor_tests(); + #[cfg(feature = "simd")] Rotation2::, 8>::run_real_factor_tests(); } diff --git a/crates/sophus_lie/src/groups/rotation3.rs b/crates/sophus_lie/src/groups/rotation3.rs index 5554b76..bd65171 100644 --- a/crates/sophus_lie/src/groups/rotation3.rs +++ b/crates/sophus_lie/src/groups/rotation3.rs @@ -603,18 +603,24 @@ pub type Rotation3 = LieGroup::test_suite(); + #[cfg(feature = "simd")] Rotation3::, 8>::test_suite(); Rotation3::::test_suite(); + #[cfg(feature = "simd")] Rotation3::, 8>::test_suite(); Rotation3::::run_real_tests(); + #[cfg(feature = "simd")] Rotation3::, 8>::run_real_tests(); Rotation3::::run_real_factor_tests(); + #[cfg(feature = "simd")] Rotation3::, 8>::run_real_factor_tests(); } diff --git a/crates/sophus_lie/src/lib.rs b/crates/sophus_lie/src/lib.rs index 8e6e5fa..38df283 100644 --- a/crates/sophus_lie/src/lib.rs +++ b/crates/sophus_lie/src/lib.rs @@ -1,4 +1,4 @@ -#![feature(portable_simd)] +#![cfg_attr(feature = "simd", feature(portable_simd))] #![deny(missing_docs)] #![allow(clippy::needless_range_loop)] //! Lie groups crate - part of the sophus-rs project diff --git a/crates/sophus_lie/src/real_lie_group.rs b/crates/sophus_lie/src/real_lie_group.rs index 2b7dbfb..f264e4f 100644 --- a/crates/sophus_lie/src/real_lie_group.rs +++ b/crates/sophus_lie/src/real_lie_group.rs @@ -8,12 +8,16 @@ use crate::Rotation2; use crate::Rotation3; use approx::assert_relative_eq; use nalgebra::SVector; -use sophus_core::calculus::dual::DualBatchScalar; use sophus_core::calculus::dual::DualScalar; use sophus_core::calculus::maps::MatrixValuedMapFromVector; use sophus_core::calculus::maps::VectorValuedMapFromMatrix; use sophus_core::calculus::maps::VectorValuedMapFromVector; + +#[cfg(feature = "simd")] +use sophus_core::calculus::dual::dual_batch_scalar::DualBatchScalar; +#[cfg(feature = "simd")] use sophus_core::linalg::BatchScalarF64; + use std::fmt::Display; use std::fmt::Formatter; @@ -513,6 +517,7 @@ macro_rules! 
diff --git a/crates/sophus_lie/src/real_lie_group.rs b/crates/sophus_lie/src/real_lie_group.rs
index 2b7dbfb..f264e4f 100644
--- a/crates/sophus_lie/src/real_lie_group.rs
+++ b/crates/sophus_lie/src/real_lie_group.rs
@@ -8,12 +8,16 @@ use crate::Rotation2;
 use crate::Rotation3;
 use approx::assert_relative_eq;
 use nalgebra::SVector;
-use sophus_core::calculus::dual::DualBatchScalar;
 use sophus_core::calculus::dual::DualScalar;
 use sophus_core::calculus::maps::MatrixValuedMapFromVector;
 use sophus_core::calculus::maps::VectorValuedMapFromMatrix;
 use sophus_core::calculus::maps::VectorValuedMapFromVector;
+
+#[cfg(feature = "simd")]
+use sophus_core::calculus::dual::dual_batch_scalar::DualBatchScalar;
+#[cfg(feature = "simd")]
 use sophus_core::linalg::BatchScalarF64;
+
 use std::fmt::Display;
 use std::fmt::Formatter;
 
@@ -513,6 +517,7 @@ macro_rules! def_real_group_test_template {
 }
 
 def_real_group_test_template!(f64, DualScalar, Rotation2<f64, 1>, Rotation2<DualScalar, 1>, 1);
+#[cfg(feature = "simd")]
 def_real_group_test_template!(
     BatchScalarF64<8>,
     DualBatchScalar<8>,
@@ -522,6 +527,7 @@ def_real_group_test_template!(
 );
 
 def_real_group_test_template!(f64, DualScalar, Isometry2<f64, 1>, Isometry2<DualScalar, 1>, 1);
+#[cfg(feature = "simd")]
 def_real_group_test_template!(
     BatchScalarF64<8>,
     DualBatchScalar<8>,
@@ -531,6 +537,7 @@ def_real_group_test_template!(
 );
 
 def_real_group_test_template!(f64, DualScalar, Rotation3<f64, 1>, Rotation3<DualScalar, 1>, 1);
+#[cfg(feature = "simd")]
 def_real_group_test_template!(
     BatchScalarF64<8>,
     DualBatchScalar<8>,
@@ -540,6 +547,7 @@ def_real_group_test_template!(
 );
 
 def_real_group_test_template!(f64, DualScalar, Isometry3<f64, 1>, Isometry3<DualScalar, 1>, 1);
+#[cfg(feature = "simd")]
 def_real_group_test_template!(
     BatchScalarF64<8>,
     DualBatchScalar<8>,
diff --git a/crates/sophus_opt/Cargo.toml b/crates/sophus_opt/Cargo.toml
index 0db3d4c..5cda437 100644
--- a/crates/sophus_opt/Cargo.toml
+++ b/crates/sophus_opt/Cargo.toml
@@ -24,3 +24,11 @@ faer.workspace = true
 nalgebra.workspace = true
 ndarray.workspace = true
 rand.workspace = true
+
+[features]
+simd = [
+  "sophus_core/simd",
+  "sophus_lie/simd",
+  "sophus_sensor/simd",
+  "sophus_image/simd",
+]
diff --git a/crates/sophus_opt/src/cost_fn.rs b/crates/sophus_opt/src/cost_fn.rs
index 9aa4d6e..7b3e0ea 100644
--- a/crates/sophus_opt/src/cost_fn.rs
+++ b/crates/sophus_opt/src/cost_fn.rs
@@ -23,7 +23,7 @@ pub trait IsTermSignature<const N: usize> {
     fn idx_ref(&self) -> &[usize; N];
 }
 
-/// Signature of a cost function 
+/// Signature of a cost function
 #[derive(Debug, Clone)]
 pub struct CostSignature<
     const NUM_ARGS: usize,
diff --git a/crates/sophus_opt/src/lib.rs b/crates/sophus_opt/src/lib.rs
index 2396f0a..a7b9a08 100644
--- a/crates/sophus_opt/src/lib.rs
+++ b/crates/sophus_opt/src/lib.rs
@@ -1,4 +1,4 @@
-#![feature(portable_simd)]
+#![cfg_attr(feature = "simd", feature(portable_simd))]
 #![deny(missing_docs)]
 #![allow(clippy::needless_range_loop)]
diff --git a/crates/sophus_opt/src/variables.rs b/crates/sophus_opt/src/variables.rs
index 3c9e546..8df0fb9 100644
--- a/crates/sophus_opt/src/variables.rs
+++ b/crates/sophus_opt/src/variables.rs
@@ -277,7 +277,7 @@ impl<M: IsVariable + 'static> IsVarFamily for VarFamily<M> {
         assert_eq!(
             self.start_indices.len(),
             0,
-            "Ths function must ony called once"
+            "This function must only be called once"
         );
 
         match self.get_var_kind() {
diff --git a/crates/sophus_pyo3/src/pyo3/lie_groups.rs b/crates/sophus_pyo3/src/pyo3/lie_groups.rs
index e45c193..9455176 100644
--- a/crates/sophus_pyo3/src/pyo3/lie_groups.rs
+++ b/crates/sophus_pyo3/src/pyo3/lie_groups.rs
@@ -404,14 +404,14 @@ macro_rules! augment_py_product_group_class {
 
 augment_py_product_group_class!(
     PyBaseIsometry2,
-    PyIsometry2, 
-    Isometry2, 
+    PyIsometry2,
+    Isometry2,
     PyRotation2,
-    "Isometry2", 
+    "Isometry2",
     2);
 augment_py_product_group_class!(
     PyBaseIsometry3,
     PyIsometry3,
-    Isometry3, 
-    PyRotation3, 
+    Isometry3,
+    PyRotation3,
     "Isometry3",3);
diff --git a/crates/sophus_sensor/Cargo.toml b/crates/sophus_sensor/Cargo.toml
index aab90c6..a6fe699 100644
--- a/crates/sophus_sensor/Cargo.toml
+++ b/crates/sophus_sensor/Cargo.toml
@@ -19,3 +19,6 @@ assertables.workspace = true
 nalgebra.workspace = true
 ndarray.workspace = true
 num-traits.workspace = true
+
+[features]
+simd = ["sophus_core/simd", "sophus_image/simd"]
diff --git a/crates/sophus_sensor/src/lib.rs b/crates/sophus_sensor/src/lib.rs
index cea326c..867f71e 100644
--- a/crates/sophus_sensor/src/lib.rs
+++ b/crates/sophus_sensor/src/lib.rs
@@ -1,4 +1,4 @@
-#![feature(portable_simd)]
+#![cfg_attr(feature = "simd", feature(portable_simd))]
 #![deny(missing_docs)]
 //! Sensor (aka camera) crate - part of the sophus-rs project
diff --git a/crates/sophus_viewer/Cargo.toml b/crates/sophus_viewer/Cargo.toml
index 26bbc60..7c3b160 100644
--- a/crates/sophus_viewer/Cargo.toml
+++ b/crates/sophus_viewer/Cargo.toml
@@ -28,3 +28,11 @@ nalgebra.workspace = true
 ndarray.workspace = true
 tokio.workspace = true
 wgpu.workspace = true
+
+[features]
+simd = [
+  "sophus_core/simd",
+  "sophus_lie/simd",
+  "sophus_sensor/simd",
+  "sophus_image/simd",
+]
diff --git a/crates/sophus_viewer/src/lib.rs b/crates/sophus_viewer/src/lib.rs
index 7d779ec..9438296 100644
--- a/crates/sophus_viewer/src/lib.rs
+++ b/crates/sophus_viewer/src/lib.rs
@@ -1,4 +1,4 @@
-#![feature(portable_simd)]
+#![cfg_attr(feature = "simd", feature(portable_simd))]
 #![deny(missing_docs)]
 #![allow(clippy::needless_range_loop)]
diff --git a/crates/sophus_viewer/src/pixel_renderer.rs b/crates/sophus_viewer/src/pixel_renderer.rs
index 386a735..92a8512 100644
--- a/crates/sophus_viewer/src/pixel_renderer.rs
+++ b/crates/sophus_viewer/src/pixel_renderer.rs
@@ -34,14 +34,6 @@ struct OrthoCam {
     dummy1: f32,
 }
 
-/// 2D vertex
-#[repr(C)]
-#[derive(Clone, Copy, Pod, Zeroable)]
-pub struct Vertex2 {
-    _pos: [f32; 2],
-    _color: [f32; 4],
-}
-
 /// 2D line vertex
 #[repr(C)]
 #[derive(Clone, Copy, Pod, Zeroable)]
diff --git a/crates/sophus_viewer/src/pixel_renderer/line_pixel_shader.wgsl b/crates/sophus_viewer/src/pixel_renderer/line_pixel_shader.wgsl
index 8bb9244..50b8b0b 100644
--- a/crates/sophus_viewer/src/pixel_renderer/line_pixel_shader.wgsl
+++ b/crates/sophus_viewer/src/pixel_renderer/line_pixel_shader.wgsl
@@ -45,8 +45,8 @@ fn vs_main(
     }
 
     out.position = vec4<f32>(2.0 * (p.x+0.5) / uniforms.width_height.x - 1.0,
-                             2.0 - 2.0*(p.y+0.5) / uniforms.width_height.y - 1.0, 
-                             0.0, 
+                             2.0 - 2.0*(p.y+0.5) / uniforms.width_height.y - 1.0,
+                             0.0,
                              1.0);
     out.color = color;
diff --git a/crates/sophus_viewer/src/pixel_renderer/point_pixel_shader.wgsl b/crates/sophus_viewer/src/pixel_renderer/point_pixel_shader.wgsl
index 7626d22..bd1af7f 100644
--- a/crates/sophus_viewer/src/pixel_renderer/point_pixel_shader.wgsl
+++ b/crates/sophus_viewer/src/pixel_renderer/point_pixel_shader.wgsl
@@ -40,8 +40,8 @@ fn vs_main(
     }
 
     out.position = vec4<f32>(2.0 * (u+0.5) / uniforms.width_height.x - 1.0,
-                             2.0 - 2.0*(v+0.5) / uniforms.width_height.y - 1.0, 
-                             0.0, 
+                             2.0 - 2.0*(v+0.5) / uniforms.width_height.y - 1.0,
+                             0.0,
                              1.0);
     out.color = color;
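Both pixel shaders above share one mapping: pixel centers (u + 0.5, v + 0.5) go to WebGPU clip space, with the y-axis flipped. A plain-Rust transcription to make the convention explicit (a sketch, not part of the patch):

```rust
// Sketch only: the pixel-to-clip mapping used by the shaders above.
fn pixel_to_clip(u: f32, v: f32, width: f32, height: f32) -> (f32, f32) {
    let x = 2.0 * (u + 0.5) / width - 1.0; // left edge -> -1, right edge -> +1
    let y = 2.0 - 2.0 * (v + 0.5) / height - 1.0; // top row -> ~+1 (y points up)
    (x, y)
}

fn main() {
    // The top-left pixel of a 640x480 target lands near the (-1, +1) corner.
    let (x, y) = pixel_to_clip(0.0, 0.0, 640.0, 480.0);
    assert!(x < -0.99 && y > 0.99);
}
```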
diff --git a/crates/sophus_viewer/src/scene_renderer/line_scene_shader.wgsl b/crates/sophus_viewer/src/scene_renderer/line_scene_shader.wgsl
index b867f51..310096b 100644
--- a/crates/sophus_viewer/src/scene_renderer/line_scene_shader.wgsl
+++ b/crates/sophus_viewer/src/scene_renderer/line_scene_shader.wgsl
@@ -63,7 +63,7 @@ fn vs_main(
         uv = uv0 - line_half_width * n;
         z = uv_z0.z;
     }
-    
+
     // map point from pixel coordinates (Computer Vision convention) to clip space coordinates (WebGPU convention)
     out.position = pixel_and_z_to_clip(uv, z, frustum_uniforms);
diff --git a/crates/sophus_viewer/src/scene_renderer/textured_mesh_scene_shader.wgsl b/crates/sophus_viewer/src/scene_renderer/textured_mesh_scene_shader.wgsl
index fdb8022..e3ac696 100644
--- a/crates/sophus_viewer/src/scene_renderer/textured_mesh_scene_shader.wgsl
+++ b/crates/sophus_viewer/src/scene_renderer/textured_mesh_scene_shader.wgsl
@@ -16,17 +16,17 @@ var<uniform> lut_uniform: DistortionLut;
 var distortion_texture: texture_2d<f32>;
 
 @group(2) @binding(0)
-var mesh_texture: texture_2d<f32>; 
+var mesh_texture: texture_2d<f32>;
 
 @group(2) @binding(1)
-var mesh_texture_sampler: sampler; 
+var mesh_texture_sampler: sampler;
 
 @vertex
 fn vs_main(
      @location(0) position: vec3<f32>,
-     @location(1) texCoords: vec2<f32> 
+     @location(1) texCoords: vec2<f32>
 ) -> VertexOut {
     var out: VertexOut;
     var uv_z = scene_point_to_distorted(position, view_uniform, frustum_uniforms, lut_uniform);
@@ -37,7 +37,7 @@ fn vs_main(
 
 @fragment
 fn fs_main(in: VertexOut) -> @location(0) vec4<f32> {
-    return textureSample(mesh_texture, mesh_texture_sampler, in.texCoords); 
+    return textureSample(mesh_texture, mesh_texture_sampler, in.texCoords);
     // return vec4<f32>(1.0, 0.0, 0.0, 1.0);
 }
diff --git a/crates/sophus_viewer/src/scene_renderer/utils.wgsl b/crates/sophus_viewer/src/scene_renderer/utils.wgsl
index 83c31fe..6ce9c45 100644
--- a/crates/sophus_viewer/src/scene_renderer/utils.wgsl
+++ b/crates/sophus_viewer/src/scene_renderer/utils.wgsl
@@ -45,12 +45,12 @@ fn z1_plane_to_distorted(point_in_proj: vec3<f32>, frustum: Frustum, lut: DistortionLut) -> vec3<f32> {
     var lut_offset_y = lut.lut_offset_y;
     var lut_range_x = lut.lut_range_x;
     var lut_range_y = lut.lut_range_y;
-    
+
     let u = clamp((width - 1.0) * (point_in_proj.x - lut_offset_x) / lut_range_x, 0.0, width - 1.00001);
     let v = clamp((height - 1.0) * (point_in_proj.y - lut_offset_y) / lut_range_y, 0.0, height - 1.00001);
 
     // Manual implementation of bilinear interpolation.
-    // This is to workaround apparent limitations of wgpu - such as no/limited support for 
+    // This is to work around apparent limitations of wgpu - such as no/limited support for
     // sampling of f32 textures and sampling in the vertex shader.
     // TODO: Figure out how to use sampling in vertex shader or maybe undistort in fragment shader
     // (first render pinhole image to texture, then undistort in fragment shader).
@@ -81,17 +81,15 @@ fn scene_point_to_distorted(scene_point: vec3<f32>,
 
 // map point from pixel coordinates (Computer Vision convention) to clip space coordinates (WebGPU convention)
 fn pixel_and_z_to_clip(uv_z: vec2<f32>, z: f32, frustum: Frustum) -> vec4<f32> {
     var width = frustum.width;
-    var height = frustum.height; 
+    var height = frustum.height;
     var near = frustum.near;
     var far = frustum.far;
     var u = uv_z.x;
     var v = uv_z.y;
-    
-    return vec4<f32>(2.0 * (u / width - 0.5),
-                     -2.0 * (v / height - 0.5), 
+
+    return vec4<f32>(2.0 * (u / width - 0.5),
+                     -2.0 * (v / height - 0.5),
                      // todo: Check whether the z value is correct.
                      (far * (z - near)) / (z * (far - near)),
                      1.0);
 }
-
-
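The `pixel_and_z_to_clip` helper above still carries a "check whether the z value is correct" todo. A quick numeric check of the z term (a sketch, not part of the patch): it maps z = near to 0 and z = far to 1, a hyperbolic depth into [0, 1]:

```rust
// Sketch only: endpoint check of the clip-space depth used in utils.wgsl.
fn z_clip(z: f32, near: f32, far: f32) -> f32 {
    (far * (z - near)) / (z * (far - near))
}

fn main() {
    let (near, far) = (0.1_f32, 100.0_f32);
    assert!(z_clip(near, near, far).abs() < 1e-6); // near plane -> 0
    assert!((z_clip(far, near, far) - 1.0).abs() < 1e-6); // far plane -> 1
}
```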
diff --git a/publish_all.sh b/publish_all.sh
new file mode 100644
index 0000000..7e6c975
--- /dev/null
+++ b/publish_all.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+set -x # echo on
+set -e # exit on error
+
+cargo publish -p sophus_core
+cargo publish -p sophus_lie
+cargo publish -p sophus_pyo3
+cargo publish -p sophus_image
+cargo publish -p sophus_sensor
+cargo publish -p sophus_opt
+cargo publish -p sophus_viewer
+cargo publish -p sophus
diff --git a/rust-toolchain.toml b/rust-toolchain.toml
deleted file mode 100644
index 5d56faf..0000000
--- a/rust-toolchain.toml
+++ /dev/null
@@ -1,2 +0,0 @@
-[toolchain]
-channel = "nightly"
diff --git a/rustfmt.toml b/rustfmt.toml
index 74c8af6..63df91e 100644
--- a/rustfmt.toml
+++ b/rustfmt.toml
@@ -1 +1 @@
-imports_granularity = "Item"
\ No newline at end of file
+imports_granularity = "Item"
diff --git a/sophus-rs.code-workspace b/sophus-rs.code-workspace
index 0fc68ee..adbf56c 100644
--- a/sophus-rs.code-workspace
+++ b/sophus-rs.code-workspace
@@ -13,5 +13,7 @@
         "rust-analyzer.imports.granularity.group": "item",
         "rust-analyzer.imports.prefix": "crate",
         "rust-analyzer.cargo.unsetTest": [],
+        "rust-analyzer.cargo.features": [
+        ],
     }
 }