diff --git a/crates/core_arch/src/aarch64/mod.rs b/crates/core_arch/src/aarch64/mod.rs
index d573e2c0b8..4821438e9f 100644
--- a/crates/core_arch/src/aarch64/mod.rs
+++ b/crates/core_arch/src/aarch64/mod.rs
@@ -18,6 +18,8 @@ pub use self::crypto::*;
mod crc;
pub use self::crc::*;
+pub use super::acle::*;
+
#[cfg(test)]
use stdsimd_test::assert_instr;
diff --git a/crates/core_arch/src/acle/barrier/common.rs b/crates/core_arch/src/acle/barrier/common.rs
new file mode 100644
index 0000000000..0fb35534d1
--- /dev/null
+++ b/crates/core_arch/src/acle/barrier/common.rs
@@ -0,0 +1,14 @@
+//! Access types available on all architectures
+
+/// Full system is the required shareability domain, reads and writes are the
+/// required access types
+pub struct SY;
+
+dmb_dsb!(SY);
+
+impl super::super::sealed::Isb for SY {
+ #[inline(always)]
+ unsafe fn __isb(&self) {
+ super::isb(super::arg::SY)
+ }
+}
diff --git a/crates/core_arch/src/acle/barrier/cp15.rs b/crates/core_arch/src/acle/barrier/cp15.rs
new file mode 100644
index 0000000000..7938acbbb4
--- /dev/null
+++ b/crates/core_arch/src/acle/barrier/cp15.rs
@@ -0,0 +1,27 @@
+// Reference: ARM11 MPCore Processor Technical Reference Manual (ARM DDI 0360E) Section 3.5 "Summary
+// of CP15 instructions"
+
+/// Full system is the required shareability domain, reads and writes are the
+/// required access types
+pub struct SY;
+
+impl super::super::sealed::Dmb for SY {
+ #[inline(always)]
+ unsafe fn __dmb(&self) {
+ asm!("mcr p15, 0, r0, c7, c10, 5" : : : "memory" : "volatile")
+ }
+}
+
+impl super::super::sealed::Dsb for SY {
+ #[inline(always)]
+ unsafe fn __dsb(&self) {
+ asm!("mcr p15, 0, r0, c7, c10, 4" : : : "memory" : "volatile")
+ }
+}
+
+impl super::super::sealed::Isb for SY {
+ #[inline(always)]
+ unsafe fn __isb(&self) {
+ asm!("mcr p15, 0, r0, c7, c5, 4" : : : "memory" : "volatile")
+ }
+}
diff --git a/crates/core_arch/src/acle/barrier/mod.rs b/crates/core_arch/src/acle/barrier/mod.rs
new file mode 100644
index 0000000000..b3cbf44d27
--- /dev/null
+++ b/crates/core_arch/src/acle/barrier/mod.rs
@@ -0,0 +1,154 @@
+// Reference: Section 7.4 "Hints" of ACLE
+
+// CP15 instruction
+#[cfg(not(any(
+ // v8
+ target_arch = "aarch64",
+ // v7
+ target_feature = "v7",
+ // v6-M
+ target_feature = "mclass"
+)))]
+mod cp15;
+
+#[cfg(not(any(
+ target_arch = "aarch64",
+ target_feature = "v7",
+ target_feature = "mclass"
+)))]
+pub use self::cp15::*;
+
+// Dedicated instructions
+#[cfg(any(
+ target_arch = "aarch64",
+ target_feature = "v7",
+ target_feature = "mclass"
+))]
+macro_rules! dmb_dsb {
+ ($A:ident) => {
+ impl super::super::sealed::Dmb for $A {
+ #[inline(always)]
+ unsafe fn __dmb(&self) {
+ super::dmb(super::arg::$A)
+ }
+ }
+
+ impl super::super::sealed::Dsb for $A {
+ #[inline(always)]
+ unsafe fn __dsb(&self) {
+ super::dsb(super::arg::$A)
+ }
+ }
+ };
+}
+
+#[cfg(any(
+ target_arch = "aarch64",
+ target_feature = "v7",
+ target_feature = "mclass"
+))]
+mod common;
+
+#[cfg(any(
+ target_arch = "aarch64",
+ target_feature = "v7",
+ target_feature = "mclass"
+))]
+pub use self::common::*;
+
+#[cfg(any(target_arch = "aarch64", target_feature = "v7",))]
+mod not_mclass;
+
+#[cfg(any(target_arch = "aarch64", target_feature = "v7",))]
+pub use self::not_mclass::*;
+
+#[cfg(target_arch = "aarch64")]
+mod v8;
+
+#[cfg(target_arch = "aarch64")]
+pub use self::v8::*;
+
+/// Generates a DMB (data memory barrier) instruction or equivalent CP15 instruction.
+///
+/// DMB ensures the observed ordering of memory accesses. Memory accesses of the specified type
+/// issued before the DMB are guaranteed to be observed (in the specified scope) before memory
+/// accesses issued after the DMB.
+///
+/// For example, DMB should be used between storing data, and updating a flag variable that makes
+/// that data available to another core.
+///
+/// The __dmb() intrinsic also acts as a compiler memory barrier of the appropriate type.
+#[inline(always)]
+pub unsafe fn __dmb(arg: A)
+where
+ A: super::sealed::Dmb,
+{
+ arg.__dmb()
+}
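+
+// A minimal usage sketch of the store-then-flag pattern described above. The caller,
+// the statics `DATA` and `READY`, and the use of volatile writes are illustrative
+// assumptions; only `__dmb` and the `SY` argument type are defined by this module.
+//
+// static mut DATA: u32 = 0;
+// static mut READY: bool = false;
+//
+// unsafe fn publish(value: u32) {
+//     core::ptr::write_volatile(&mut DATA, value);  // store the payload
+//     __dmb(SY);                                    // order the store before the flag update
+//     core::ptr::write_volatile(&mut READY, true);  // make the payload visible to readers
+// }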
+
+/// Generates a DSB (data synchronization barrier) instruction or equivalent CP15 instruction.
+///
+/// DSB ensures the completion of memory accesses. A DSB behaves as the equivalent DMB and has
+/// additional properties. After a DSB instruction completes, all memory accesses of the specified
+/// type issued before the DSB are guaranteed to have completed.
+///
+/// The __dsb() intrinsic also acts as a compiler memory barrier of the appropriate type.
+#[inline(always)]
+pub unsafe fn __dsb(arg: A)
+where
+ A: super::sealed::Dsb,
+{
+ arg.__dsb()
+}
+
+/// Generates an ISB (instruction synchronization barrier) instruction or equivalent CP15
+/// instruction.
+///
+/// This instruction flushes the processor pipeline fetch buffers, so that following instructions
+/// are fetched from cache or memory.
+///
+/// An ISB is needed after some system maintenance operations. An ISB is also needed before
+/// transferring control to code that has been loaded or modified in memory, for example by an
+/// overlay mechanism or just-in-time code generator. (Note that if instruction and data caches are
+/// separate, privileged cache maintenance operations would be needed in order to unify the caches.)
+///
+/// The only supported argument for the __isb() intrinsic is 15, corresponding to the SY (full
+/// system) scope of the ISB instruction.
+#[inline(always)]
+pub unsafe fn __isb(arg: A)
+where
+ A: super::sealed::Isb,
+{
+ arg.__isb()
+}
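+
+// A hedged sketch of the "modified code" case described above. The caller and the
+// omitted cache-maintenance steps are assumptions; only `__dsb`, `__isb` and `SY`
+// come from this module.
+//
+// unsafe fn run_patched_code(entry: extern "C" fn()) {
+//     __dsb(SY);  // ensure the newly written instructions have reached memory
+//     __isb(SY);  // flush the pipeline so stale instructions are not executed
+//     entry();
+// }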
+
+extern "C" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.dmb")]
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.dmb")]
+ fn dmb(_: i32);
+
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.dsb")]
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.dsb")]
+ fn dsb(_: i32);
+
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.isb")]
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.isb")]
+ fn isb(_: i32);
+}
+
+// we put these in a module to prevent weirdness with glob re-exports
+mod arg {
+ // See Section 7.3 Memory barriers of ACLE
+ pub const SY: i32 = 15;
+ pub const ST: i32 = 14;
+ pub const LD: i32 = 13;
+ pub const ISH: i32 = 11;
+ pub const ISHST: i32 = 10;
+ pub const ISHLD: i32 = 9;
+ pub const NSH: i32 = 7;
+ pub const NSHST: i32 = 6;
+ pub const NSHLD: i32 = 5;
+ pub const OSH: i32 = 3;
+ pub const OSHST: i32 = 2;
+ pub const OSHLD: i32 = 1;
+}
diff --git a/crates/core_arch/src/acle/barrier/not_mclass.rs b/crates/core_arch/src/acle/barrier/not_mclass.rs
new file mode 100644
index 0000000000..385e1d5289
--- /dev/null
+++ b/crates/core_arch/src/acle/barrier/not_mclass.rs
@@ -0,0 +1,43 @@
+//! Access types available on v7 and v8 but not on v7(E)-M or v8-M
+
+/// Full system is the required shareability domain, writes are the required
+/// access type
+pub struct ST;
+
+dmb_dsb!(ST);
+
+/// Inner Shareable is the required shareability domain, reads and writes are
+/// the required access types
+pub struct ISH;
+
+dmb_dsb!(ISH);
+
+/// Inner Shareable is the required shareability domain, writes are the required
+/// access type
+pub struct ISHST;
+
+dmb_dsb!(ISHST);
+
+/// Non-shareable is the required shareability domain, reads and writes are the
+/// required access types
+pub struct NSH;
+
+dmb_dsb!(NSH);
+
+/// Non-shareable is the required shareability domain, writes are the required
+/// access type
+pub struct NSHST;
+
+dmb_dsb!(NSHST);
+
+/// Outer Shareable is the required shareability domain, reads and writes are
+/// the required access types
+pub struct OSH;
+
+dmb_dsb!(OSH);
+
+/// Outer Shareable is the required shareability domain, writes are the required
+/// access type
+pub struct OSHST;
+
+dmb_dsb!(OSHST);
diff --git a/crates/core_arch/src/acle/barrier/v8.rs b/crates/core_arch/src/acle/barrier/v8.rs
new file mode 100644
index 0000000000..2951a5a670
--- /dev/null
+++ b/crates/core_arch/src/acle/barrier/v8.rs
@@ -0,0 +1,23 @@
+/// Full system is the required shareability domain, reads are the required
+/// access type
+pub struct LD;
+
+dmb_dsb!(LD);
+
+/// Inner Shareable is the required shareability domain, reads are the required
+/// access type
+pub struct ISHLD;
+
+dmb_dsb!(ISHLD);
+
+/// Non-shareable is the required shareability domain, reads are the required
+/// access type
+pub struct NSHLD;
+
+dmb_dsb!(NSHLD);
+
+/// Outer Shareable is the required shareability domain, reads are the required
+/// access type
+pub struct OSHLD;
+
+dmb_dsb!(OSHLD);
diff --git a/crates/core_arch/src/acle/dsp.rs b/crates/core_arch/src/acle/dsp.rs
new file mode 100644
index 0000000000..e929e98e40
--- /dev/null
+++ b/crates/core_arch/src/acle/dsp.rs
@@ -0,0 +1,76 @@
+//! # References:
+//!
+//! - Section 8.3 "16-bit multiplications"
+//!
+//! Intrinsics that could live here:
+//!
+//! - [ ] __smulbb
+//! - [ ] __smulbt
+//! - [ ] __smultb
+//! - [ ] __smultt
+//! - [ ] __smulwb
+//! - [ ] __smulwt
+//! - [x] __qadd
+//! - [x] __qsub
+//! - [ ] __qdbl
+//! - [ ] __smlabb
+//! - [ ] __smlabt
+//! - [ ] __smlatb
+//! - [ ] __smlatt
+//! - [ ] __smlawb
+//! - [ ] __smlawt
+
+#[cfg(test)]
+use stdsimd_test::assert_instr;
+
+extern "C" {
+ #[link_name = "llvm.arm.qadd"]
+ fn arm_qadd(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.qsub"]
+ fn arm_qsub(a: i32, b: i32) -> i32;
+
+}
+
+/// Signed saturating addition
+///
+/// Returns the 32-bit saturating signed equivalent of a + b.
+#[inline]
+#[cfg_attr(test, assert_instr(qadd))]
+pub unsafe fn __qadd(a: i32, b: i32) -> i32 {
+ arm_qadd(a, b)
+}
+
+/// Signed saturating subtraction
+///
+/// Returns the 32-bit saturating signed equivalent of a - b.
+#[inline]
+#[cfg_attr(test, assert_instr(qsub))]
+pub unsafe fn __qsub(a: i32, b: i32) -> i32 {
+ arm_qsub(a, b)
+}
+
+#[cfg(test)]
+mod tests {
+ use core_arch::arm::*;
+ use std::mem;
+ use stdsimd_test::simd_test;
+
+ #[test]
+ fn qadd() {
+ unsafe {
+ assert_eq!(super::__qadd(-10, 60), 50);
+ assert_eq!(super::__qadd(::std::i32::MAX, 10), ::std::i32::MAX);
+ assert_eq!(super::__qadd(::std::i32::MIN, -10), ::std::i32::MIN);
+ }
+ }
+
+ #[test]
+ fn qsub() {
+ unsafe {
+ assert_eq!(super::__qsub(10, 60), -50);
+ assert_eq!(super::__qsub(::std::i32::MAX, -10), ::std::i32::MAX);
+ assert_eq!(super::__qsub(::std::i32::MIN, 10), ::std::i32::MIN);
+ }
+ }
+}
diff --git a/crates/core_arch/src/acle/ex.rs b/crates/core_arch/src/acle/ex.rs
new file mode 100644
index 0000000000..0426c65186
--- /dev/null
+++ b/crates/core_arch/src/acle/ex.rs
@@ -0,0 +1,117 @@
+// Reference: Section 5.4.4 "LDREX / STREX" of ACLE
+
+/// Removes the exclusive lock created by LDREX
+// Supported: v6, v6K, v7-M, v7-A, v7-R
+// Not supported: v5, v6-M
+// NOTE: there's no dedicated CLREX instruction in v6 (<v6k); to clear the exclusive monitor users
+// have to do a dummy STREX operation
+#[cfg(any(
+ all(target_feature = "v6k", not(target_feature = "mclass")), // excludes v6-M
+ all(target_feature = "v7", target_feature = "mclass"), // v7-M
+))]
+pub unsafe fn __clrex() {
+ extern "C" {
+ #[link_name = "llvm.arm.clrex"]
+ fn clrex();
+ }
+
+ clrex()
+}
+
+/// Executes an exclusive LDR instruction for an 8-bit value.
+// Supported: v6K, v7-M, v7-A, v7-R, v8
+// Not supported: v5, v6, v6-M
+#[cfg(
+ target_feature = "v6k", // includes v7-M but excludes v6-M
+)]
+pub unsafe fn __ldrexb(p: *const u8) -> u8 {
+ extern "C" {
+ #[link_name = "llvm.arm.ldrex.p0i8"]
+ fn ldrex8(p: *const u8) -> u32;
+ }
+
+ ldrex8(p) as u8
+}
+
+/// Executes an exclusive LDR instruction for a 16-bit value.
+// Supported: v6K, v7-M, v7-A, v7-R, v8
+// Not supported: v5, v6, v6-M
+#[cfg(
+ target_feature = "v6k", // includes v7-M but excludes v6-M
+)]
+pub unsafe fn __ldrexh(p: *const u16) -> u16 {
+ extern "C" {
+ #[link_name = "llvm.arm.ldrex.p0i16"]
+ fn ldrex16(p: *const u16) -> u32;
+ }
+
+ ldrex16(p) as u16
+}
+
+/// Executes an exclusive LDR instruction for a 32-bit value.
+// Supported: v6, v7-M, v6K, v7-A, v7-R, v8
+// Not supported: v5, v6-M
+#[cfg(any(
+ all(target_feature = "v6", not(target_feature = "mclass")), // excludes v6-M
+ all(target_feature = "v7", target_feature = "mclass"), // v7-M
+))]
+pub unsafe fn __ldrex(p: *const u32) -> u32 {
+ extern "C" {
+ #[link_name = "llvm.arm.ldrex.p0i32"]
+ fn ldrex32(p: *const u32) -> u32;
+ }
+
+ ldrex32(p)
+}
+
+/// Executes an exclusive STR instruction for 8-bit values
+///
+/// Returns `0` if the operation succeeded, or `1` if it failed
+// supported: v6K, v7-M, v7-A, v7-R
+// Not supported: v5, v6, v6-M
+#[cfg(
+ target_feature = "v6k", // includes v7-M but excludes v6-M
+)]
+pub unsafe fn __strexb(value: u32, addr: *mut u8) -> u32 {
+ extern "C" {
+ #[link_name = "llvm.arm.strex.p0i8"]
+ fn strex8(value: u32, addr: *mut u8) -> u32;
+ }
+
+ strex8(value, addr)
+}
+
+/// Executes an exclusive STR instruction for 16-bit values
+///
+/// Returns `0` if the operation succeeded, or `1` if it failed
+// Supported: v6K, v7-M, v7-A, v7-R, v8
+// Not supported: v5, v6, v6-M
+#[cfg(
+ target_feature = "v6k", // includes v7-M but excludes v6-M
+)]
+pub unsafe fn __strexh(value: u16, addr: *mut u16) -> u32 {
+ extern "C" {
+ #[link_name = "llvm.arm.strex.p0i16"]
+ fn strex16(value: u32, addr: *mut u16) -> u32;
+ }
+
+ strex16(value as u32, addr)
+}
+
+/// Executes an exclusive STR instruction for 32-bit values
+///
+/// Returns `0` if the operation succeeded, or `1` if it failed
+// Supported: v6, v7-M, v6K, v7-A, v7-R, v8
+// Not supported: v5, v6-M
+#[cfg(any(
+ all(target_feature = "v6", not(target_feature = "mclass")), // excludes v6-M
+ all(target_feature = "v7", target_feature = "mclass"), // v7-M
+))]
+pub unsafe fn __strex(value: u32, addr: *mut u32) -> u32 {
+ extern "C" {
+ #[link_name = "llvm.arm.strex.p0i32"]
+ fn strex32(value: u32, addr: *mut u32) -> u32;
+ }
+
+ strex32(value, addr)
+}
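+
+// A usage sketch of the exclusive-access protocol (the function below is hypothetical
+// caller code, not part of this module): STREX reports failure with a non-zero result,
+// so the load/modify/store sequence is retried until the store succeeds.
+//
+// unsafe fn atomic_increment(counter: *mut u32) {
+//     loop {
+//         let value = __ldrex(counter);          // load and mark the address as exclusive
+//         if __strex(value + 1, counter) == 0 {  // store succeeds only if still exclusive
+//             break;
+//         }
+//     }
+// }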
diff --git a/crates/core_arch/src/acle/hints.rs b/crates/core_arch/src/acle/hints.rs
new file mode 100644
index 0000000000..20faed69cb
--- /dev/null
+++ b/crates/core_arch/src/acle/hints.rs
@@ -0,0 +1,135 @@
+// # References
+//
+// - Section 7.4 "Hints" of ACLE
+// - Section 7.7 "NOP" of ACLE
+
+/// Generates a WFI (wait for interrupt) hint instruction, or nothing.
+///
+/// The WFI instruction allows (but does not require) the processor to enter a
+/// low-power state until one of a number of asynchronous events occurs.
+// Section 10.1 of ACLE says that the supported arches are: 8, 6K, 6-M
+// LLVM says "instruction requires: armv6k"
+#[cfg(any(target_feature = "v6", target_arch = "aarch64"))]
+#[inline(always)]
+pub unsafe fn __wfi() {
+ hint(HINT_WFI);
+}
+
+/// Generates a WFE (wait for event) hint instruction, or nothing.
+///
+/// The WFE instruction allows (but does not require) the processor to enter a
+/// low-power state until some event occurs such as a SEV being issued by
+/// another processor.
+// Section 10.1 of ACLE says that the supported arches are: 8, 6K, 6-M
+// LLVM says "instruction requires: armv6k"
+#[cfg(any(target_feature = "v6", target_arch = "aarch64"))]
+#[inline(always)]
+pub unsafe fn __wfe() {
+ hint(HINT_WFE);
+}
+
+/// Generates a SEV (send a global event) hint instruction.
+///
+/// This causes an event to be signaled to all processors in a multiprocessor
+/// system. It is a NOP on a uniprocessor system.
+// Section 10.1 of ACLE says that the supported arches are: 8, 6K, 6-M, 7-M
+// LLVM says "instruction requires: armv6k"
+#[cfg(any(target_feature = "v6", target_arch = "aarch64"))]
+#[inline(always)]
+pub unsafe fn __sev() {
+ hint(HINT_SEV);
+}
+
+/// Generates an SEVL (send a local event) hint instruction.
+///
+/// This causes an event to be signaled to only the processor executing this
+/// instruction. In a multiprocessor system, it is not required to affect the
+/// other processors.
+// LLVM says "instruction requires: armv8"
+#[cfg(any(
+ target_feature = "v8", // 32-bit ARMv8
+ target_arch = "aarch64", // AArch64
+))]
+#[inline(always)]
+pub unsafe fn __sevl() {
+ hint(HINT_SEVL);
+}
+
+/// Generates a YIELD hint instruction.
+///
+/// This enables multithreading software to indicate to the hardware that it is
+/// performing a task, for example a spin-lock, that could be swapped out to
+/// improve overall system performance.
+// Section 10.1 of ACLE says that the supported arches are: 8, 6K, 6-M
+// LLVM says "instruction requires: armv6k"
+#[cfg(any(target_feature = "v6", target_arch = "aarch64"))]
+#[inline(always)]
+pub unsafe fn __yield() {
+ hint(HINT_YIELD);
+}
+
+/// Generates a DBG instruction.
+///
+/// This provides a hint to debugging and related systems. The argument must be
+/// a constant integer from 0 to 15 inclusive. See implementation documentation
+/// for the effect (if any) of this instruction and the meaning of the
+/// argument. This is available only when compiling for AArch32.
+// Section 10.1 of ACLE says that the supported arches are: 7, 7-M
+// "The DBG hint instruction is added in ARMv7. It is UNDEFINED in the ARMv6 base architecture, and
+// executes as a NOP instruction in ARMv6K and ARMv6T2." - ARM Architecture Reference Manual ARMv7-A
+// and ARMv7-R edition (ARM DDI 0406C.c) sections D12.4.1 "ARM instruction set support" and D12.4.2
+// "Thumb instruction set support"
+#[cfg(target_feature = "v7")]
+#[inline(always)]
+#[rustc_args_required_const(0)]
+pub unsafe fn __dbg(imm4: u32) {
+ macro_rules! call {
+ ($imm4:expr) => {
+ asm!(concat!("DBG ", stringify!($imm4)) : : : : "volatile")
+ }
+ }
+
+ match imm4 & 0b1111 {
+ 0 => call!(0),
+ 1 => call!(1),
+ 2 => call!(2),
+ 3 => call!(3),
+ 4 => call!(4),
+ 5 => call!(5),
+ 6 => call!(6),
+ 7 => call!(7),
+ 8 => call!(8),
+ 9 => call!(9),
+ 10 => call!(10),
+ 11 => call!(11),
+ 12 => call!(12),
+ 13 => call!(13),
+ 14 => call!(14),
+ _ => call!(15),
+ }
+}
+
+/// Generates an unspecified no-op instruction.
+///
+/// Note that not all architectures provide a distinguished NOP instruction. On
+/// those that do, it is unspecified whether this intrinsic generates it or
+/// another instruction. It is not guaranteed that inserting this instruction
+/// will increase execution time.
+#[inline(always)]
+pub unsafe fn __nop() {
+ asm!("NOP" : : : : "volatile")
+}
+
+extern "C" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.hint")]
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.hint")]
+ fn hint(_: i32);
+}
+
+// from LLVM 7.0.1's lib/Target/ARM/{ARMInstrThumb,ARMInstrInfo,ARMInstrThumb2}.td
+const HINT_NOP: i32 = 0;
+const HINT_YIELD: i32 = 1;
+const HINT_WFE: i32 = 2;
+const HINT_WFI: i32 = 3;
+const HINT_SEV: i32 = 4;
+const HINT_SEVL: i32 = 5;
diff --git a/crates/core_arch/src/acle/mod.rs b/crates/core_arch/src/acle/mod.rs
new file mode 100644
index 0000000000..5f29decf5a
--- /dev/null
+++ b/crates/core_arch/src/acle/mod.rs
@@ -0,0 +1,158 @@
+//! ARM C Language Extensions (ACLE)
+//!
+//! # Developer notes
+//!
+//! Below is a list of built-in targets that are representative of the different ARM
+//! architectures; the list includes the `target_feature`s they possess.
+//!
+//! - `armv4t-unknown-linux-gnueabi` - **ARMv4** - `+v4t`
+//! - `armv5te-unknown-linux-gnueabi` - **ARMv5TE** - `+v4t +v5te`
+//! - `arm-unknown-linux-gnueabi` - **ARMv6** - `+v4t +v5te +v6`
+//! - `thumbv6m-none-eabi` - **ARMv6-M** - `+v4t +v5te +v6 +thumb-mode +mclass`
+//! - `armv7-unknown-linux-gnueabihf` - **ARMv7-A** - `+v4t +v5te +v6 +v6k +v6t2 +v7 +dsp +thumb2 +aclass`
+//! - `armv7r-none-eabi` - **ARMv7-R** - `+v4t +v5te +v6 +v6k +v6t2 +v7 +dsp +thumb2 +rclass`
+//! - `thumbv7m-none-eabi` - **ARMv7-M** - `+v4t +v5te +v6 +v6k +v6t2 +v7 +thumb2 +thumb-mode +mclass`
+//! - `thumbv7em-none-eabi` - **ARMv7E-M** - `+v4t +v5te +v6 +v6k +v6t2 +v7 +dsp +thumb2 +thumb-mode +mclass`
+//! - `thumbv8m.main-none-eabi` - **ARMv8-M** - `+v4t +v5te +v6 +v6k +v6t2 +v7 +thumb2 +thumb-mode +mclass`
+//! - `armv8r-none-eabi` - **ARMv8-R** - `+v4t +v5te +v6 +v6k +v6t2 +v7 +v8 +thumb2 +rclass`
+//! - `aarch64-unknown-linux-gnu` - **ARMv8-A (AArch64)** - `+fp +neon`
+//!
+//! Section 10.1 of ACLE says:
+//!
+//! - "In the sequence of Arm architectures { v5, v5TE, v6, v6T2, v7 } each architecture includes
+//! its predecessor instruction set."
+//!
+//! - "In the sequence of Thumb-only architectures { v6-M, v7-M, v7E-M } each architecture includes
+//! its predecessor instruction set."
+//!
+//! From that info and from looking at how LLVM features work (using custom targets) we can identify
+//! features that are subsets of others:
+//!
+//! Legend: `a < b` reads as "`a` is a subset of `b`"; this means that if `b` is enabled then `a` is
+//! enabled as well.
+//!
+//! - `v4t < v5te < v6 < v6k < v6t2 < v7 < v8`
+//! - `v6 < v8m < v6t2`
+//! - `v7 < v8m.main`
+//!
+//! *NOTE*: Section 5.4.7 of ACLE says:
+//!
+//! - "__ARM_FEATURE_DSP is defined to 1 if the DSP (v5E) instructions are supported and the
+//! intrinsics defined in Saturating intrinsics are available."
+//!
+//! This does *not* match how LLVM uses the '+dsp' feature; this feature is not set for v5te
+//! targets so we have to work around this difference.
+//!
+//! # References
+//!
+//! - [ACLE Q2 2018](https://developer.arm.com/docs/101028/latest)
+
+// 8, 7 and 6-M are supported via dedicated instructions like DMB. All other arches are supported
+// via CP15 instructions. See Section 10.1 of ACLE
+mod barrier;
+
+pub use self::barrier::*;
+
+mod hints;
+
+pub use self::hints::*;
+
+mod registers;
+
+pub use self::registers::*;
+
+mod ex;
+
+pub use self::ex::*;
+
+// Supported arches: 5TE, 7E-M. See Section 10.1 of ACLE (e.g. QADD)
+// We also include the A profile even though DSP is deprecated on that profile as of ACLE 2.0 (see
+// section 5.4.7)
+// Here we workaround the difference between LLVM's +dsp and ACLE's __ARM_FEATURE_DSP by gating on
+// '+v5te' rather than on '+dsp'
+#[cfg(all(
+ not(target_arch = "aarch64"),
+ any(
+ // >= v5TE but excludes v7-M
+ all(target_feature = "v5te", not(target_feature = "mclass")),
+ // v7E-M
+ all(target_feature = "mclass", target_feature = "dsp"),
+ )
+))]
+mod dsp;
+
+#[cfg(all(
+ not(target_arch = "aarch64"),
+ any(
+ all(target_feature = "v5te", not(target_feature = "mclass")),
+ all(target_feature = "mclass", target_feature = "dsp"),
+ )
+))]
+pub use self::dsp::*;
+
+// Supported arches: 6, 7-M. See Section 10.1 of ACLE (e.g. SSAT)
+#[cfg(all(not(target_arch = "aarch64"), target_feature = "v6",))]
+mod sat;
+
+#[cfg(all(not(target_arch = "aarch64"), target_feature = "v6",))]
+pub use self::sat::*;
+
+// Deprecated in ACLE 2.0 for the A profile but fully supported on the M and R profiles, says
+// Section 5.4.9 of ACLE. We'll expose these for the A profile even if deprecated
+#[cfg(all(
+ not(target_arch = "aarch64"),
+ any(
+ // v7-A, v7-R
+ all(target_feature = "v6", not(target_feature = "mclass")),
+ // v7E-M
+ all(target_feature = "mclass", target_feature = "dsp")
+ )
+))]
+mod simd32;
+
+#[cfg(all(
+ not(target_arch = "aarch64"),
+ any(
+ all(target_feature = "v6", not(target_feature = "mclass")),
+ all(target_feature = "mclass", target_feature = "dsp")
+ )
+))]
+pub use self::simd32::*;
+
+mod sealed {
+ pub trait Dmb {
+ unsafe fn __dmb(&self);
+ }
+
+ pub trait Dsb {
+ unsafe fn __dsb(&self);
+ }
+
+ pub trait Isb {
+ unsafe fn __isb(&self);
+ }
+
+ pub trait Rsr {
+ unsafe fn __rsr(&self) -> u32;
+ }
+
+ pub trait Rsr64 {
+ unsafe fn __rsr64(&self) -> u64;
+ }
+
+ pub trait Rsrp {
+ unsafe fn __rsrp(&self) -> *const u8;
+ }
+
+ pub trait Wsr {
+ unsafe fn __wsr(&self, value: u32);
+ }
+
+ pub trait Wsr64 {
+ unsafe fn __wsr64(&self, value: u64);
+ }
+
+ pub trait Wsrp {
+ unsafe fn __wsrp(&self, value: *const u8);
+ }
+}
diff --git a/crates/core_arch/src/acle/registers/aarch32.rs b/crates/core_arch/src/acle/registers/aarch32.rs
new file mode 100644
index 0000000000..f59af5d3ae
--- /dev/null
+++ b/crates/core_arch/src/acle/registers/aarch32.rs
@@ -0,0 +1,4 @@
+/// Application Program Status Register
+pub struct APSR;
+
+rsr!(APSR);
diff --git a/crates/core_arch/src/acle/registers/mod.rs b/crates/core_arch/src/acle/registers/mod.rs
new file mode 100644
index 0000000000..73fcc2c7b0
--- /dev/null
+++ b/crates/core_arch/src/acle/registers/mod.rs
@@ -0,0 +1,121 @@
+#[allow(unused_macros)]
+macro_rules! rsr {
+ ($R:ident) => {
+ impl super::super::sealed::Rsr for $R {
+ unsafe fn __rsr(&self) -> u32 {
+ let r: u32;
+ asm!(concat!("mrs $0,", stringify!($R)) : "=r"(r) : : : "volatile");
+ r
+ }
+ }
+ };
+}
+
+#[allow(unused_macros)]
+macro_rules! rsrp {
+ ($R:ident) => {
+ impl super::super::sealed::Rsrp for $R {
+ unsafe fn __rsrp(&self) -> *const u8 {
+ let r: *const u8;
+ asm!(concat!("mrs $0,", stringify!($R)) : "=r"(r) : : : "volatile");
+ r
+ }
+ }
+ };
+}
+
+#[allow(unused_macros)]
+macro_rules! wsr {
+ ($R:ident) => {
+ impl super::super::sealed::Wsr for $R {
+ unsafe fn __wsr(&self, value: u32) {
+ asm!(concat!("msr ", stringify!($R), ",$0") : : "r"(value) : : "volatile");
+ }
+ }
+ };
+}
+
+#[allow(unused_macros)]
+macro_rules! wsrp {
+ ($R:ident) => {
+ impl super::super::sealed::Wsrp for $R {
+ unsafe fn __wsrp(&self, value: *const u8) {
+ asm!(concat!("msr ", stringify!($R), ",$0") : : "r"(value) : : "volatile");
+ }
+ }
+ };
+}
+
+#[cfg(target_feature = "mclass")]
+mod v6m;
+
+#[cfg(target_feature = "mclass")]
+pub use self::v6m::*;
+
+#[cfg(all(target_feature = "v7", target_feature = "mclass"))]
+mod v7m;
+
+#[cfg(all(target_feature = "v7", target_feature = "mclass"))]
+pub use self::v7m::*;
+
+#[cfg(not(target_arch = "aarch64"))]
+mod aarch32;
+
+#[cfg(not(target_arch = "aarch64"))]
+pub use self::aarch32::*;
+
+/// Reads a 32-bit system register
+#[inline(always)]
+pub unsafe fn __rsr(reg: R) -> u32
+where
+ R: super::sealed::Rsr,
+{
+ reg.__rsr()
+}
+
+/// Reads a 64-bit system register
+#[cfg(target_arch = "aarch64")]
+#[inline(always)]
+pub unsafe fn __rsr64(reg: R) -> u64
+where
+ R: super::sealed::Rsr64,
+{
+ reg.__rsr64()
+}
+
+/// Reads a system register containing an address
+#[inline(always)]
+pub unsafe fn __rsrp(reg: R) -> *const u8
+where
+ R: super::sealed::Rsrp,
+{
+ reg.__rsrp()
+}
+
+/// Writes a 32-bit system register
+#[inline(always)]
+pub unsafe fn __wsr(reg: R, value: u32)
+where
+ R: super::sealed::Wsr,
+{
+ reg.__wsr(value)
+}
+
+/// Writes a 64-bit system register
+#[cfg(target_arch = "aarch64")]
+#[inline(always)]
+pub unsafe fn __wsr64(reg: R, value: u64)
+where
+ R: super::sealed::Wsr64,
+{
+ reg.__wsr64(value)
+}
+
+/// Writes a system register containing an address
+#[inline(always)]
+pub unsafe fn __wsrp(reg: R, value: *const u8)
+where
+ R: super::sealed::Wsrp,
+{
+ reg.__wsrp(value)
+}
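+
+// A short usage sketch (hypothetical M-profile caller code; `PRIMASK` is defined in the
+// sibling `v6m` module): the register to access is selected by passing its marker type.
+//
+// unsafe fn mask_interrupts() -> u32 {
+//     let old = __rsr(PRIMASK);  // read the current priority mask
+//     __wsr(PRIMASK, 1);         // mask all configurable-priority interrupts
+//     old
+// }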
diff --git a/crates/core_arch/src/acle/registers/v6m.rs b/crates/core_arch/src/acle/registers/v6m.rs
new file mode 100644
index 0000000000..7acc63b6d1
--- /dev/null
+++ b/crates/core_arch/src/acle/registers/v6m.rs
@@ -0,0 +1,39 @@
+/// CONTROL register
+pub struct CONTROL;
+
+rsr!(CONTROL);
+wsr!(CONTROL);
+
+/// Execution Program Status Register
+pub struct EPSR;
+
+rsr!(EPSR);
+
+/// Interrupt Program Status Register
+pub struct IPSR;
+
+rsr!(IPSR);
+
+/// Main Stack Pointer
+pub struct MSP;
+
+rsrp!(MSP);
+wsrp!(MSP);
+
+/// Priority Mask Register
+pub struct PRIMASK;
+
+rsr!(PRIMASK);
+wsr!(PRIMASK);
+
+/// Process Stack Pointer
+pub struct PSP;
+
+rsrp!(PSP);
+wsrp!(PSP);
+
+/// Program Status Register
+#[allow(non_camel_case_types)]
+pub struct xPSR;
+
+rsr!(xPSR);
diff --git a/crates/core_arch/src/acle/registers/v7m.rs b/crates/core_arch/src/acle/registers/v7m.rs
new file mode 100644
index 0000000000..d1b1d474f1
--- /dev/null
+++ b/crates/core_arch/src/acle/registers/v7m.rs
@@ -0,0 +1,17 @@
+/// Base Priority Mask Register
+pub struct BASEPRI;
+
+rsr!(BASEPRI);
+wsr!(BASEPRI);
+
+/// Base Priority Mask Register (conditional write)
+#[allow(non_camel_case_types)]
+pub struct BASEPRI_MAX;
+
+wsr!(BASEPRI_MAX);
+
+/// Fault Mask Register
+pub struct FAULTMASK;
+
+rsr!(FAULTMASK);
+wsr!(FAULTMASK);
diff --git a/crates/core_arch/src/acle/sat.rs b/crates/core_arch/src/acle/sat.rs
new file mode 100644
index 0000000000..38c98d7342
--- /dev/null
+++ b/crates/core_arch/src/acle/sat.rs
@@ -0,0 +1,8 @@
+//! # References:
+//!
+//! - Section 8.4 "Saturating intrinsics"
+//!
+//! Intrinsics that could live here:
+//!
+//! - __ssat
+//! - __usat
diff --git a/crates/core_arch/src/arm/dsp.rs b/crates/core_arch/src/acle/simd32.rs
similarity index 75%
rename from crates/core_arch/src/arm/dsp.rs
rename to crates/core_arch/src/acle/simd32.rs
index 8385e7ed21..a259f90d2c 100644
--- a/crates/core_arch/src/arm/dsp.rs
+++ b/crates/core_arch/src/acle/simd32.rs
@@ -1,8 +1,66 @@
-//! ARM DSP Intrinsics.
+//! # References
//!
-//! Based on "Arm C Language Extensions (ACLE) Version Q2 2018"
+//! - Section 8.5 "32-bit SIMD intrinsics" of ACLE
//!
-//! https://developer.arm.com/products/software-development-tools/compilers/arm-compiler-5/docs/101028/0006
+//! Intrinsics that could live here
+//!
+//! - [x] __sel
+//! - [ ] __ssat16
+//! - [ ] __usat16
+//! - [ ] __sxtab16
+//! - [ ] __sxtb16
+//! - [ ] __uxtab16
+//! - [ ] __uxtb16
+//! - [x] __qadd8
+//! - [x] __qsub8
+//! - [x] __sadd8
+//! - [x] __shadd8
+//! - [x] __shsub8
+//! - [ ] __ssub8
+//! - [ ] __uadd8
+//! - [ ] __uhadd8
+//! - [ ] __uhsub8
+//! - [ ] __uqadd8
+//! - [ ] __uqsub8
+//! - [ ] __usub8
+//! - [x] __usad8
+//! - [x] __usada8
+//! - [x] __qadd16
+//! - [x] __qasx
+//! - [x] __qsax
+//! - [x] __qsub16
+//! - [x] __sadd16
+//! - [x] __sasx
+//! - [x] __shadd16
+//! - [ ] __shasx
+//! - [ ] __shsax
+//! - [x] __shsub16
+//! - [ ] __ssax
+//! - [ ] __ssub16
+//! - [ ] __uadd16
+//! - [ ] __uasx
+//! - [ ] __uhadd16
+//! - [ ] __uhasx
+//! - [ ] __uhsax
+//! - [ ] __uhsub16
+//! - [ ] __uqadd16
+//! - [ ] __uqasx
+//! - [x] __uqsax
+//! - [ ] __uqsub16
+//! - [ ] __usax
+//! - [ ] __usub16
+//! - [x] __smlad
+//! - [ ] __smladx
+//! - [ ] __smlald
+//! - [ ] __smlaldx
+//! - [x] __smlsd
+//! - [ ] __smlsdx
+//! - [ ] __smlsld
+//! - [ ] __smlsldx
+//! - [x] __smuad
+//! - [x] __smuadx
+//! - [x] __smusd
+//! - [x] __smusdx
#[cfg(test)]
use stdsimd_test::assert_instr;
@@ -25,45 +83,39 @@ macro_rules! dsp_call {
}
extern "C" {
- #[link_name = "llvm.arm.qadd"]
- fn arm_qadd(a: i32, b: i32) -> i32;
+ #[link_name = "llvm.arm.qadd8"]
+ fn arm_qadd8(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.qsub8"]
+ fn arm_qsub8(a: i32, b: i32) -> i32;
+
+ #[link_name = "llvm.arm.qsub16"]
+ fn arm_qsub16(a: i32, b: i32) -> i32;
#[link_name = "llvm.arm.qadd16"]
fn arm_qadd16(a: i32, b: i32) -> i32;
- #[link_name = "llvm.arm.qadd8"]
- fn arm_qadd8(a: i32, b: i32) -> i32;
-
#[link_name = "llvm.arm.qasx"]
fn arm_qasx(a: i32, b: i32) -> i32;
#[link_name = "llvm.arm.qsax"]
fn arm_qsax(a: i32, b: i32) -> i32;
- #[link_name = "llvm.arm.qsub"]
- fn arm_qsub(a: i32, b: i32) -> i32;
-
- #[link_name = "llvm.arm.qsub8"]
- fn arm_qsub8(a: i32, b: i32) -> i32;
-
- #[link_name = "llvm.arm.qsub16"]
- fn arm_qsub16(a: i32, b: i32) -> i32;
-
#[link_name = "llvm.arm.sadd16"]
fn arm_sadd16(a: i32, b: i32) -> i32;
#[link_name = "llvm.arm.sadd8"]
fn arm_sadd8(a: i32, b: i32) -> i32;
- #[link_name = "llvm.arm.sasx"]
- fn arm_sasx(a: i32, b: i32) -> i32;
-
#[link_name = "llvm.arm.smlad"]
fn arm_smlad(a: i32, b: i32, c: i32) -> i32;
#[link_name = "llvm.arm.smlsd"]
fn arm_smlsd(a: i32, b: i32, c: i32) -> i32;
+ #[link_name = "llvm.arm.sasx"]
+ fn arm_sasx(a: i32, b: i32) -> i32;
+
#[link_name = "llvm.arm.sel"]
fn arm_sel(a: i32, b: i32) -> i32;
@@ -95,24 +147,6 @@ extern "C" {
fn arm_usad8(a: i32, b: i32) -> u32;
}
-/// Signed saturating addition
-///
-/// Returns the 32-bit saturating signed equivalent of a + b.
-#[inline]
-#[cfg_attr(test, assert_instr(qadd))]
-pub unsafe fn qadd(a: i32, b: i32) -> i32 {
- arm_qadd(a, b)
-}
-
-/// Signed saturating subtraction
-///
-/// Returns the 32-bit saturating signed equivalent of a - b.
-#[inline]
-#[cfg_attr(test, assert_instr(qsub))]
-pub unsafe fn qsub(a: i32, b: i32) -> i32 {
- arm_qsub(a, b)
-}
-
/// Saturating four 8-bit integer additions
///
/// Returns the 8-bit signed equivalent of
@@ -123,7 +157,7 @@ pub unsafe fn qsub(a: i32, b: i32) -> i32 {
/// res\[3\] = a\[3\] + b\[3\]
#[inline]
#[cfg_attr(test, assert_instr(qadd8))]
-pub unsafe fn qadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+pub unsafe fn __qadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
dsp_call!(arm_qadd8, a, b)
}
@@ -137,7 +171,7 @@ pub unsafe fn qadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
/// res\[3\] = a\[3\] - b\[3\]
#[inline]
#[cfg_attr(test, assert_instr(qsub8))]
-pub unsafe fn qsub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+pub unsafe fn __qsub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
dsp_call!(arm_qsub8, a, b)
}
@@ -149,7 +183,7 @@ pub unsafe fn qsub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
/// res\[1\] = a\[1\] - b\[1\]
#[inline]
#[cfg_attr(test, assert_instr(qsub16))]
-pub unsafe fn qsub16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+pub unsafe fn __qsub16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
dsp_call!(arm_qsub16, a, b)
}
@@ -161,7 +195,7 @@ pub unsafe fn qsub16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
/// res\[1\] = a\[1\] + b\[1\]
#[inline]
#[cfg_attr(test, assert_instr(qadd16))]
-pub unsafe fn qadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+pub unsafe fn __qadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
dsp_call!(arm_qadd16, a, b)
}
@@ -171,7 +205,7 @@ pub unsafe fn qadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
/// res\[1\] = a\[1\] + b\[0\]
#[inline]
#[cfg_attr(test, assert_instr(qasx))]
-pub unsafe fn qasx(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+pub unsafe fn __qasx(a: int16x2_t, b: int16x2_t) -> int16x2_t {
dsp_call!(arm_qasx, a, b)
}
@@ -181,7 +215,7 @@ pub unsafe fn qasx(a: int16x2_t, b: int16x2_t) -> int16x2_t {
/// res\[1\] = a\[1\] - b\[0\]
#[inline]
#[cfg_attr(test, assert_instr(qsax))]
-pub unsafe fn qsax(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+pub unsafe fn __qsax(a: int16x2_t, b: int16x2_t) -> int16x2_t {
dsp_call!(arm_qsax, a, b)
}
@@ -193,7 +227,7 @@ pub unsafe fn qsax(a: int16x2_t, b: int16x2_t) -> int16x2_t {
/// and the GE bits of the APSR are set.
#[inline]
#[cfg_attr(test, assert_instr(sadd16))]
-pub unsafe fn sadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+pub unsafe fn __sadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
dsp_call!(arm_sadd16, a, b)
}
@@ -207,7 +241,7 @@ pub unsafe fn sadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
/// and the GE bits of the APSR are set.
#[inline]
#[cfg_attr(test, assert_instr(sadd8))]
-pub unsafe fn sadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+pub unsafe fn __sadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
dsp_call!(arm_sadd8, a, b)
}
@@ -218,7 +252,7 @@ pub unsafe fn sadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
/// res = a\[0\] * b\[0\] + a\[1\] * b\[1\] + c
#[inline]
#[cfg_attr(test, assert_instr(smlad))]
-pub unsafe fn smlad(a: int16x2_t, b: int16x2_t, c: i32) -> i32 {
+pub unsafe fn __smlad(a: int16x2_t, b: int16x2_t, c: i32) -> i32 {
arm_smlad(::mem::transmute(a), ::mem::transmute(b), c)
}
@@ -229,7 +263,7 @@ pub unsafe fn smlad(a: int16x2_t, b: int16x2_t, c: i32) -> i32 {
/// res = a\[0\] * b\[0\] - a\[1\] * b\[1\] + c
#[inline]
#[cfg_attr(test, assert_instr(smlsd))]
-pub unsafe fn smlsd(a: int16x2_t, b: int16x2_t, c: i32) -> i32 {
+pub unsafe fn __smlsd(a: int16x2_t, b: int16x2_t, c: i32) -> i32 {
arm_smlsd(::mem::transmute(a), ::mem::transmute(b), c)
}
@@ -241,7 +275,7 @@ pub unsafe fn smlsd(a: int16x2_t, b: int16x2_t, c: i32) -> i32 {
/// and the GE bits of the APSR are set.
#[inline]
#[cfg_attr(test, assert_instr(sasx))]
-pub unsafe fn sasx(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+pub unsafe fn __sasx(a: int16x2_t, b: int16x2_t) -> int16x2_t {
dsp_call!(arm_sasx, a, b)
}
@@ -257,8 +291,7 @@ pub unsafe fn sasx(a: int16x2_t, b: int16x2_t) -> int16x2_t {
/// where GE are bits of APSR
#[inline]
#[cfg_attr(test, assert_instr(sel))]
-#[cfg(all(not(target_feature = "mclass")))]
-pub unsafe fn sel(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+pub unsafe fn __sel(a: int8x4_t, b: int8x4_t) -> int8x4_t {
dsp_call!(arm_sel, a, b)
}
@@ -272,7 +305,7 @@ pub unsafe fn sel(a: int8x4_t, b: int8x4_t) -> int8x4_t {
/// res\[3\] = (a\[3\] + b\[3\]) / 2
#[inline]
#[cfg_attr(test, assert_instr(shadd8))]
-pub unsafe fn shadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+pub unsafe fn __shadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
dsp_call!(arm_shadd8, a, b)
}
@@ -284,7 +317,7 @@ pub unsafe fn shadd8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
/// res\[1\] = (a\[1\] + b\[1\]) / 2
#[inline]
#[cfg_attr(test, assert_instr(shadd16))]
-pub unsafe fn shadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+pub unsafe fn __shadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
dsp_call!(arm_shadd16, a, b)
}
@@ -298,7 +331,7 @@ pub unsafe fn shadd16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
/// res\[3\] = (a\[3\] - b\[3\]) / 2
#[inline]
#[cfg_attr(test, assert_instr(shsub8))]
-pub unsafe fn shsub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
+pub unsafe fn __shsub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
dsp_call!(arm_shsub8, a, b)
}
@@ -310,7 +343,7 @@ pub unsafe fn shsub8(a: int8x4_t, b: int8x4_t) -> int8x4_t {
/// res\[1\] = (a\[1\] - b\[1\]) / 2
#[inline]
#[cfg_attr(test, assert_instr(shsub16))]
-pub unsafe fn shsub16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
+pub unsafe fn __shsub16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
dsp_call!(arm_shsub16, a, b)
}
@@ -323,7 +356,7 @@ pub unsafe fn shsub16(a: int16x2_t, b: int16x2_t) -> int16x2_t {
/// and sets the Q flag if overflow occurs on the addition.
#[inline]
#[cfg_attr(test, assert_instr(smuad))]
-pub unsafe fn smuad(a: int16x2_t, b: int16x2_t) -> i32 {
+pub unsafe fn __smuad(a: int16x2_t, b: int16x2_t) -> i32 {
arm_smuad(::mem::transmute(a), ::mem::transmute(b))
}
@@ -336,7 +369,7 @@ pub unsafe fn smuad(a: int16x2_t, b: int16x2_t) -> i32 {
/// and sets the Q flag if overflow occurs on the addition.
#[inline]
#[cfg_attr(test, assert_instr(smuadx))]
-pub unsafe fn smuadx(a: int16x2_t, b: int16x2_t) -> i32 {
+pub unsafe fn __smuadx(a: int16x2_t, b: int16x2_t) -> i32 {
arm_smuadx(::mem::transmute(a), ::mem::transmute(b))
}
@@ -349,7 +382,7 @@ pub unsafe fn smuadx(a: int16x2_t, b: int16x2_t) -> i32 {
/// and sets the Q flag if overflow occurs on the addition.
#[inline]
#[cfg_attr(test, assert_instr(smusd))]
-pub unsafe fn smusd(a: int16x2_t, b: int16x2_t) -> i32 {
+pub unsafe fn __smusd(a: int16x2_t, b: int16x2_t) -> i32 {
arm_smusd(::mem::transmute(a), ::mem::transmute(b))
}
@@ -362,7 +395,7 @@ pub unsafe fn smusd(a: int16x2_t, b: int16x2_t) -> i32 {
/// and sets the Q flag if overflow occurs on the addition.
#[inline]
#[cfg_attr(test, assert_instr(smusdx))]
-pub unsafe fn smusdx(a: int16x2_t, b: int16x2_t) -> i32 {
+pub unsafe fn __smusdx(a: int16x2_t, b: int16x2_t) -> i32 {
arm_smusdx(::mem::transmute(a), ::mem::transmute(b))
}
@@ -374,7 +407,7 @@ pub unsafe fn smusdx(a: int16x2_t, b: int16x2_t) -> i32 {
/// (a\[2\] - b\[2\]) + (a\[3\] - b\[3\])
#[inline]
#[cfg_attr(test, assert_instr(usad8))]
-pub unsafe fn usad8(a: int8x4_t, b: int8x4_t) -> u32 {
+pub unsafe fn __usad8(a: int8x4_t, b: int8x4_t) -> u32 {
arm_usad8(::mem::transmute(a), ::mem::transmute(b))
}
@@ -386,42 +419,23 @@ pub unsafe fn usad8(a: int8x4_t, b: int8x4_t) -> u32 {
/// (a\[2\] - b\[2\]) + (a\[3\] - b\[3\]) + c
#[inline]
#[cfg_attr(test, assert_instr(usad8))]
-pub unsafe fn usad8a(a: int8x4_t, b: int8x4_t, c: u32) -> u32 {
- usad8(a, b) + c
+pub unsafe fn __usada8(a: int8x4_t, b: int8x4_t, c: u32) -> u32 {
+ __usad8(a, b) + c
}
#[cfg(test)]
mod tests {
- use core_arch::arm::*;
- use core_arch::simd::*;
+ use core_arch::simd::{i16x2, i8x4};
use std::mem;
use stdsimd_test::simd_test;
- #[test]
- fn qadd() {
- unsafe {
- assert_eq!(dsp::qadd(-10, 60), 50);
- assert_eq!(dsp::qadd(::std::i32::MAX, 10), ::std::i32::MAX);
- assert_eq!(dsp::qadd(::std::i32::MIN, -10), ::std::i32::MIN);
- }
- }
-
- #[test]
- fn qsub() {
- unsafe {
- assert_eq!(dsp::qsub(10, 60), -50);
- assert_eq!(dsp::qsub(::std::i32::MAX, -10), ::std::i32::MAX);
- assert_eq!(dsp::qsub(::std::i32::MIN, 10), ::std::i32::MIN);
- }
- }
-
#[test]
fn qadd8() {
unsafe {
let a = i8x4::new(1, 2, 3, ::std::i8::MAX);
let b = i8x4::new(2, -1, 0, 1);
let c = i8x4::new(3, 1, 3, ::std::i8::MAX);
- let r: i8x4 = dsp_call!(dsp::qadd8, a, b);
+ let r: i8x4 = dsp_call!(super::__qadd8, a, b);
assert_eq!(r, c);
}
}
@@ -432,7 +446,7 @@ mod tests {
let a = i8x4::new(1, 2, 3, ::std::i8::MIN);
let b = i8x4::new(2, -1, 0, 1);
let c = i8x4::new(-1, 3, 3, ::std::i8::MIN);
- let r: i8x4 = dsp_call!(dsp::qsub8, a, b);
+ let r: i8x4 = dsp_call!(super::__qsub8, a, b);
assert_eq!(r, c);
}
}
@@ -443,7 +457,7 @@ mod tests {
let a = i16x2::new(1, 2);
let b = i16x2::new(2, -1);
let c = i16x2::new(3, 1);
- let r: i16x2 = dsp_call!(dsp::qadd16, a, b);
+ let r: i16x2 = dsp_call!(super::__qadd16, a, b);
assert_eq!(r, c);
}
}
@@ -454,7 +468,7 @@ mod tests {
let a = i16x2::new(10, 20);
let b = i16x2::new(20, -10);
let c = i16x2::new(-10, 30);
- let r: i16x2 = dsp_call!(dsp::qsub16, a, b);
+ let r: i16x2 = dsp_call!(super::__qsub16, a, b);
assert_eq!(r, c);
}
}
@@ -465,7 +479,7 @@ mod tests {
let a = i16x2::new(1, ::std::i16::MAX);
let b = i16x2::new(2, 2);
let c = i16x2::new(-1, ::std::i16::MAX);
- let r: i16x2 = dsp_call!(dsp::qasx, a, b);
+ let r: i16x2 = dsp_call!(super::__qasx, a, b);
assert_eq!(r, c);
}
}
@@ -476,7 +490,7 @@ mod tests {
let a = i16x2::new(1, ::std::i16::MAX);
let b = i16x2::new(2, 2);
let c = i16x2::new(3, ::std::i16::MAX - 2);
- let r: i16x2 = dsp_call!(dsp::qsax, a, b);
+ let r: i16x2 = dsp_call!(super::__qsax, a, b);
assert_eq!(r, c);
}
}
@@ -487,7 +501,7 @@ mod tests {
let a = i16x2::new(1, ::std::i16::MAX);
let b = i16x2::new(2, 2);
let c = i16x2::new(3, -::std::i16::MAX);
- let r: i16x2 = dsp_call!(dsp::sadd16, a, b);
+ let r: i16x2 = dsp_call!(super::__sadd16, a, b);
assert_eq!(r, c);
}
}
@@ -498,7 +512,7 @@ mod tests {
let a = i8x4::new(1, 2, 3, ::std::i8::MAX);
let b = i8x4::new(4, 3, 2, 2);
let c = i8x4::new(5, 5, 5, -::std::i8::MAX);
- let r: i8x4 = dsp_call!(dsp::sadd8, a, b);
+ let r: i8x4 = dsp_call!(super::__sadd8, a, b);
assert_eq!(r, c);
}
}
@@ -509,7 +523,7 @@ mod tests {
let a = i16x2::new(1, 2);
let b = i16x2::new(2, 1);
let c = i16x2::new(0, 4);
- let r: i16x2 = dsp_call!(dsp::sasx, a, b);
+ let r: i16x2 = dsp_call!(super::__sasx, a, b);
assert_eq!(r, c);
}
}
@@ -519,7 +533,7 @@ mod tests {
unsafe {
let a = i16x2::new(1, 2);
let b = i16x2::new(3, 4);
- let r = dsp::smlad(::mem::transmute(a), ::mem::transmute(b), 10);
+ let r = super::__smlad(::mem::transmute(a), ::mem::transmute(b), 10);
assert_eq!(r, (1 * 3) + (2 * 4) + 10);
}
}
@@ -529,7 +543,7 @@ mod tests {
unsafe {
let a = i16x2::new(1, 2);
let b = i16x2::new(3, 4);
- let r = dsp::smlsd(::mem::transmute(a), ::mem::transmute(b), 10);
+ let r = super::__smlsd(::mem::transmute(a), ::mem::transmute(b), 10);
assert_eq!(r, ((1 * 3) - (2 * 4)) + 10);
}
}
@@ -540,9 +554,9 @@ mod tests {
let a = i8x4::new(1, 2, 3, ::std::i8::MAX);
let b = i8x4::new(4, 3, 2, 2);
// call sadd8() to set GE bits
- dsp::sadd8(::mem::transmute(a), ::mem::transmute(b));
+ super::__sadd8(::mem::transmute(a), ::mem::transmute(b));
let c = i8x4::new(1, 2, 3, ::std::i8::MAX);
- let r: i8x4 = dsp_call!(dsp::sel, a, b);
+ let r: i8x4 = dsp_call!(super::__sel, a, b);
assert_eq!(r, c);
}
}
@@ -553,7 +567,7 @@ mod tests {
let a = i8x4::new(1, 2, 3, 4);
let b = i8x4::new(5, 4, 3, 2);
let c = i8x4::new(3, 3, 3, 3);
- let r: i8x4 = dsp_call!(dsp::shadd8, a, b);
+ let r: i8x4 = dsp_call!(super::__shadd8, a, b);
assert_eq!(r, c);
}
}
@@ -564,7 +578,7 @@ mod tests {
let a = i16x2::new(1, 2);
let b = i16x2::new(5, 4);
let c = i16x2::new(3, 3);
- let r: i16x2 = dsp_call!(dsp::shadd16, a, b);
+ let r: i16x2 = dsp_call!(super::__shadd16, a, b);
assert_eq!(r, c);
}
}
@@ -575,7 +589,7 @@ mod tests {
let a = i8x4::new(1, 2, 3, 4);
let b = i8x4::new(5, 4, 3, 2);
let c = i8x4::new(-2, -1, 0, 1);
- let r: i8x4 = dsp_call!(dsp::shsub8, a, b);
+ let r: i8x4 = dsp_call!(super::__shsub8, a, b);
assert_eq!(r, c);
}
}
@@ -586,7 +600,7 @@ mod tests {
let a = i16x2::new(1, 2);
let b = i16x2::new(5, 4);
let c = i16x2::new(-2, -1);
- let r: i16x2 = dsp_call!(dsp::shsub16, a, b);
+ let r: i16x2 = dsp_call!(super::__shsub16, a, b);
assert_eq!(r, c);
}
}
@@ -596,7 +610,7 @@ mod tests {
unsafe {
let a = i16x2::new(1, 2);
let b = i16x2::new(5, 4);
- let r = dsp::smuad(::mem::transmute(a), ::mem::transmute(b));
+ let r = super::__smuad(::mem::transmute(a), ::mem::transmute(b));
assert_eq!(r, 13);
}
}
@@ -606,7 +620,7 @@ mod tests {
unsafe {
let a = i16x2::new(1, 2);
let b = i16x2::new(5, 4);
- let r = dsp::smuadx(::mem::transmute(a), ::mem::transmute(b));
+ let r = super::__smuadx(::mem::transmute(a), ::mem::transmute(b));
assert_eq!(r, 14);
}
}
@@ -616,7 +630,7 @@ mod tests {
unsafe {
let a = i16x2::new(1, 2);
let b = i16x2::new(5, 4);
- let r = dsp::smusd(::mem::transmute(a), ::mem::transmute(b));
+ let r = super::__smusd(::mem::transmute(a), ::mem::transmute(b));
assert_eq!(r, -3);
}
}
@@ -626,7 +640,7 @@ mod tests {
unsafe {
let a = i16x2::new(1, 2);
let b = i16x2::new(5, 4);
- let r = dsp::smusdx(::mem::transmute(a), ::mem::transmute(b));
+ let r = super::__smusdx(::mem::transmute(a), ::mem::transmute(b));
assert_eq!(r, -6);
}
}
@@ -636,7 +650,7 @@ mod tests {
unsafe {
let a = i8x4::new(1, 2, 3, 4);
let b = i8x4::new(4, 3, 2, 1);
- let r = dsp::usad8(::mem::transmute(a), ::mem::transmute(b));
+ let r = super::__usad8(::mem::transmute(a), ::mem::transmute(b));
assert_eq!(r, 8);
}
}
@@ -647,7 +661,7 @@ mod tests {
let a = i8x4::new(1, 2, 3, 4);
let b = i8x4::new(4, 3, 2, 1);
let c = 10;
- let r = dsp::usad8a(::mem::transmute(a), ::mem::transmute(b), c);
+ let r = super::__usada8(::mem::transmute(a), ::mem::transmute(b), c);
assert_eq!(r, 8 + c);
}
}
diff --git a/crates/core_arch/src/arm/cmsis.rs b/crates/core_arch/src/arm/cmsis.rs
deleted file mode 100644
index bc8509d3e8..0000000000
--- a/crates/core_arch/src/arm/cmsis.rs
+++ /dev/null
@@ -1,330 +0,0 @@
-//! CMSIS: Cortex Microcontroller Software Interface Standard
-//!
-//! The version 5 of the standard can be found at:
-//!
-//! http://arm-software.github.io/CMSIS_5/Core/html/index.html
-//!
-//! The API reference of the standard can be found at:
-//!
-//! - Core function access -- http://arm-software.github.io/CMSIS_5/Core/html/group__Core__Register__gr.html
-//! - Intrinsic functions for CPU instructions -- http://arm-software.github.io/CMSIS_5/Core/html/group__intrinsic__CPU__gr.html
-//!
-//! The reference C implementation used as the base of this Rust port can be
-//! found at
-//!
-//! https://github.com/ARM-software/CMSIS_5/blob/5.3.0/CMSIS/Core/Include/cmsis_gcc.h
-
-#![allow(non_snake_case)]
-
-/* Core function access */
-
-/// Enable IRQ Interrupts
-///
-/// Enables IRQ interrupts by clearing the I-bit in the CPSR. Can only be
-/// executed in Privileged modes.
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(cpsie))]
-pub unsafe fn __enable_irq() {
- asm!("cpsie i" : : : "memory" : "volatile");
-}
-
-/// Disable IRQ Interrupts
-///
-/// Disables IRQ interrupts by setting the I-bit in the CPSR. Can only be
-/// executed in Privileged modes.
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(cpsid))]
-pub unsafe fn __disable_irq() {
- asm!("cpsid i" : : : "memory" : "volatile");
-}
-
-/// Get Control Register
-///
-/// Returns the content of the Control Register.
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(mrs))]
-pub unsafe fn __get_CONTROL() -> u32 {
- let result: u32;
- asm!("mrs $0, CONTROL" : "=r"(result) : : : "volatile");
- result
-}
-
-/// Set Control Register
-///
-/// Writes the given value to the Control Register.
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(msr))]
-pub unsafe fn __set_CONTROL(control: u32) {
- asm!("msr CONTROL, $0" : : "r"(control) : "memory" : "volatile");
-}
-
-/// Get IPSR Register
-///
-/// Returns the content of the IPSR Register.
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(mrs))]
-pub unsafe fn __get_IPSR() -> u32 {
- let result: u32;
- asm!("mrs $0, IPSR" : "=r"(result) : : : "volatile");
- result
-}
-
-/// Get APSR Register
-///
-/// Returns the content of the APSR Register.
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(mrs))]
-pub unsafe fn __get_APSR() -> u32 {
- let result: u32;
- asm!("mrs $0, APSR" : "=r"(result) : : : "volatile");
- result
-}
-
-/// Get xPSR Register
-///
-/// Returns the content of the xPSR Register.
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(mrs))]
-pub unsafe fn __get_xPSR() -> u32 {
- let result: u32;
- asm!("mrs $0, XPSR" : "=r"(result) : : : "volatile");
- result
-}
-
-/// Get Process Stack Pointer
-///
-/// Returns the current value of the Process Stack Pointer (PSP).
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(mrs))]
-pub unsafe fn __get_PSP() -> u32 {
- let result: u32;
- asm!("mrs $0, PSP" : "=r"(result) : : : "volatile");
- result
-}
-
-/// Set Process Stack Pointer
-///
-/// Assigns the given value to the Process Stack Pointer (PSP).
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(msr))]
-pub unsafe fn __set_PSP(top_of_proc_stack: u32) {
- asm!("msr PSP, $0" : : "r"(top_of_proc_stack) : : "volatile");
-}
-
-/// Get Main Stack Pointer
-///
-/// Returns the current value of the Main Stack Pointer (MSP).
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(mrs))]
-pub unsafe fn __get_MSP() -> u32 {
- let result: u32;
- asm!("mrs $0, MSP" : "=r"(result) : : : "volatile");
- result
-}
-
-/// Set Main Stack Pointer
-///
-/// Assigns the given value to the Main Stack Pointer (MSP).
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(msr))]
-pub unsafe fn __set_MSP(top_of_main_stack: u32) {
- asm!("msr MSP, $0" : : "r"(top_of_main_stack) : : "volatile");
-}
-
-/// Get Priority Mask
-///
-/// Returns the current state of the priority mask bit from the Priority Mask
-/// Register.
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(mrs))]
-pub unsafe fn __get_PRIMASK() -> u32 {
- let result: u32;
- asm!("mrs $0, PRIMASK" : "=r"(result) : : "memory" : "volatile");
- result
-}
-
-/// Set Priority Mask
-///
-/// Assigns the given value to the Priority Mask Register.
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(msr))]
-pub unsafe fn __set_PRIMASK(pri_mask: u32) {
- asm!("msr PRIMASK, $0" : : "r"(pri_mask) : : "volatile");
-}
-
-#[cfg(any(target_feature = "v7", dox))]
-mod v7 {
- /// Enable FIQ
- ///
- /// Enables FIQ interrupts by clearing the F-bit in the CPSR. Can only be
- /// executed in Privileged modes.
- #[inline]
- #[target_feature(enable = "mclass")]
- #[cfg_attr(test, assert_instr(cpsie))]
- pub unsafe fn __enable_fault_irq() {
- asm!("cpsie f" : : : "memory" : "volatile");
- }
-
- /// Disable FIQ
- ///
- /// Disables FIQ interrupts by setting the F-bit in the CPSR. Can only be
- /// executed in Privileged modes.
- #[inline]
- #[target_feature(enable = "mclass")]
- #[cfg_attr(test, assert_instr(cpsid))]
- pub unsafe fn __disable_fault_irq() {
- asm!("cpsid f" : : : "memory" : "volatile");
- }
-
- /// Get Base Priority
- ///
- /// Returns the current value of the Base Priority register.
- #[inline]
- #[target_feature(enable = "mclass")]
- #[cfg_attr(test, assert_instr(mrs))]
- pub unsafe fn __get_BASEPRI() -> u32 {
- let result: u32;
- asm!("mrs $0, BASEPRI" : "=r"(result) : : : "volatile");
- result
- }
-
- /// Set Base Priority
- ///
- /// Assigns the given value to the Base Priority register.
- #[inline]
- #[target_feature(enable = "mclass")]
- #[cfg_attr(test, assert_instr(msr))]
- pub unsafe fn __set_BASEPRI(base_pri: u32) {
- asm!("msr BASEPRI, $0" : : "r"(base_pri) : "memory" : "volatile");
- }
-
- /// Set Base Priority with condition
- ///
- /// Assigns the given value to the Base Priority register only if BASEPRI
- /// masking is disabled, or the new value increases the BASEPRI
- /// priority level.
- #[inline]
- #[target_feature(enable = "mclass")]
- #[cfg_attr(test, assert_instr(mrs))]
- pub unsafe fn __set_BASEPRI_MAX(base_pri: u32) {
- asm!("msr BASEPRI_MAX, $0" : : "r"(base_pri) : "memory" : "volatile");
- }
-
- /// Get Fault Mask
- ///
- /// Returns the current value of the Fault Mask register.
- #[inline]
- #[target_feature(enable = "mclass")]
- #[cfg_attr(test, assert_instr(mrs))]
- pub unsafe fn __get_FAULTMASK() -> u32 {
- let result: u32;
- asm!("mrs $0, FAULTMASK" : "=r"(result) : : : "volatile");
- result
- }
-
- /// Set Fault Mask
- ///
- /// Assigns the given value to the Fault Mask register.
- #[inline]
- #[target_feature(enable = "mclass")]
- #[cfg_attr(test, assert_instr(msr))]
- pub unsafe fn __set_FAULTMASK(fault_mask: u32) {
- asm!("msr FAULTMASK, $0" : : "r"(fault_mask) : "memory" : "volatile");
- }
-}
-
-#[cfg(any(target_feature = "v7", dox))]
-pub use self::v7::*;
-
-/* Core instruction access */
-
-/// No Operation
-///
-/// No Operation does nothing. This instruction can be used for code alignment
-/// purposes.
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(nop))]
-pub unsafe fn __NOP() {
- asm!("nop" : : : : "volatile");
-}
-
-/// Wait For Interrupt
-///
-/// Wait For Interrupt is a hint instruction that suspends execution until one
-/// of a number of events occurs.
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(wfi))]
-pub unsafe fn __WFI() {
- asm!("wfi" : : : : "volatile");
-}
-
-/// Wait For Event
-///
-/// Wait For Event is a hint instruction that permits the processor to enter a
-/// low-power state until one of a number of events occurs.
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(wfe))]
-pub unsafe fn __WFE() {
- asm!("wfe" : : : : "volatile");
-}
-
-/// Send Event
-///
-/// Send Event is a hint instruction. It causes an event to be signaled to the
-/// CPU.
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(sev))]
-pub unsafe fn __SEV() {
- asm!("sev" : : : : "volatile");
-}
-
-/// Instruction Synchronization Barrier
-///
-/// Instruction Synchronization Barrier flushes the pipeline in the processor,
-/// so that all instructions following the ISB are fetched from cache or
-/// memory, after the instruction has been completed.
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(isb))]
-pub unsafe fn __ISB() {
- asm!("isb 0xF" : : : "memory" : "volatile");
-}
-
-/// Data Synchronization Barrier
-///
-/// Acts as a special kind of Data Memory Barrier. It completes when all
-/// explicit memory accesses before this instruction complete.
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(dsb))]
-pub unsafe fn __DSB() {
- asm!("dsb 0xF" : : : "memory" : "volatile");
-}
-
-/// Data Memory Barrier
-///
-/// Ensures the apparent order of the explicit memory operations before and
-/// after the instruction, without ensuring their completion.
-#[inline]
-#[target_feature(enable = "mclass")]
-#[cfg_attr(test, assert_instr(dmb))]
-pub unsafe fn __DMB() {
- asm!("dmb 0xF" : : : "memory" : "volatile");
-}
diff --git a/crates/core_arch/src/arm/mod.rs b/crates/core_arch/src/arm/mod.rs
index 30ff991f8d..e5b40c9bc7 100644
--- a/crates/core_arch/src/arm/mod.rs
+++ b/crates/core_arch/src/arm/mod.rs
@@ -11,11 +11,6 @@ mod armclang;
pub use self::armclang::*;
-#[cfg(any(target_feature = "mclass", dox))]
-mod cmsis;
-#[cfg(any(target_feature = "mclass", dox))]
-pub use self::cmsis::*;
-
mod v6;
pub use self::v6::*;
@@ -24,11 +19,6 @@ mod v7;
#[cfg(any(target_arch = "aarch64", target_feature = "v7"))]
pub use self::v7::*;
-#[cfg(any(all(target_feature = "v7", not(target_feature = "mclass")), dox))]
-mod dsp;
-#[cfg(any(all(target_feature = "v7", not(target_feature = "mclass")), dox))]
-pub use self::dsp::*;
-
// NEON is supported on AArch64, and on ARM when built with the v7 and neon
// features. Building ARM without neon produces incorrect codegen.
#[cfg(any(
@@ -44,6 +34,8 @@ mod neon;
))]
pub use self::neon::*;
+pub use super::acle::*;
+
#[cfg(test)]
use stdsimd_test::assert_instr;
diff --git a/crates/core_arch/src/mod.rs b/crates/core_arch/src/mod.rs
index 9705e091ca..1ca811ee75 100644
--- a/crates/core_arch/src/mod.rs
+++ b/crates/core_arch/src/mod.rs
@@ -3,6 +3,9 @@
#[macro_use]
mod macros;
+#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
+mod acle;
+
mod simd;
#[cfg_attr(
diff --git a/crates/stdsimd-test/src/lib.rs b/crates/stdsimd-test/src/lib.rs
index dec44401d9..66ee9dd894 100644
--- a/crates/stdsimd-test/src/lib.rs
+++ b/crates/stdsimd-test/src/lib.rs
@@ -153,6 +153,10 @@ pub fn assert(fnptr: usize, fnname: &str, expected: &str) {
// in some cases exceed the limit.
"cvtpi2ps" => 25,
+ // core_arch/src/acle/simd32
+ "usad8" => 27,
+ "qadd8" | "qsub8" | "sadd8" | "sel" | "shadd8" | "shsub8" => 29,
+
// Original limit was 20 instructions, but ARM DSP Intrinsics
// are exactly 20 instructions long. So bump
// the limit to 22 instead of adding here a