xtensa: use generic cache API #50136

Merged · 7 commits · Apr 26, 2023
2 changes: 1 addition & 1 deletion arch/xtensa/core/xtensa-asm2.c
@@ -88,7 +88,7 @@ void arch_new_thread(struct k_thread *thread, k_thread_stack_t *stack,
#ifdef CONFIG_KERNEL_COHERENCE
__ASSERT((((size_t)stack) % XCHAL_DCACHE_LINESIZE) == 0, "");
__ASSERT((((size_t)stack_ptr) % XCHAL_DCACHE_LINESIZE) == 0, "");
-z_xtensa_cache_flush_inv(stack, (char *)stack_ptr - (char *)stack);
+sys_cache_data_flush_and_invd_range(stack, (char *)stack_ptr - (char *)stack);
#endif
}
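
For context: the hunk above is the shape of the entire PR — each private z_xtensa_cache_* call becomes the equivalent sys_cache_data_* call from <zephyr/cache.h>. A minimal sketch of the flush-and-invalidate pattern, assuming only the generic header (the buffer name and use case are illustrative, not from this PR):

#include <zephyr/cache.h>

/* Hypothetical buffer shared with another CPU or a DMA agent. */
static uint8_t shared_buf[256];

void publish_shared_buf(void)
{
	/* Write dirty lines back to RAM and drop them from the L1 dcache,
	 * so the other agent sees current data and this CPU re-fetches
	 * fresh contents on its next access.
	 */
	sys_cache_data_flush_and_invd_range(shared_buf, sizeof(shared_buf));
}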

14 changes: 7 additions & 7 deletions arch/xtensa/include/kernel_arch_func.h
@@ -13,7 +13,7 @@
#ifndef _ASMLANGUAGE
#include <kernel_internal.h>
#include <string.h>
-#include <zephyr/arch/xtensa/cache.h>
+#include <zephyr/cache.h>
#include <zsr.h>

#ifdef __cplusplus
@@ -33,7 +33,7 @@ static ALWAYS_INLINE void arch_kernel_init(void)
/* Make sure we don't have live data for unexpected cached
* regions due to boot firmware
*/
-z_xtensa_cache_flush_inv_all();
+sys_cache_data_flush_and_invd_all();

/* Our cache top stash location might have junk in it from a
* pre-boot environment. Must be zero or valid!
@@ -115,7 +115,7 @@ static ALWAYS_INLINE void arch_cohere_stacks(struct k_thread *old_thread,
* automatically overwritten as needed.
*/
if (curr_cpu != new_thread->arch.last_cpu) {
-z_xtensa_cache_inv((void *)nsp, (nstack + nsz) - nsp);
+sys_cache_data_invd_range((void *)nsp, (nstack + nsz) - nsp);
}
old_thread->arch.last_cpu = curr_cpu;

@@ -143,8 +143,8 @@ static ALWAYS_INLINE void arch_cohere_stacks(struct k_thread *old_thread,
* to the stack top stashed in a special register.
*/
if (old_switch_handle != NULL) {
-z_xtensa_cache_flush((void *)osp, (ostack + osz) - osp);
-z_xtensa_cache_inv((void *)ostack, osp - ostack);
+sys_cache_data_flush_range((void *)osp, (ostack + osz) - osp);
+sys_cache_data_invd_range((void *)ostack, osp - ostack);
} else {
/* When in a switch, our current stack is the outbound
* stack. Flush the single line containing the stack
@@ -155,8 +155,8 @@ static ALWAYS_INLINE void arch_cohere_stacks(struct k_thread *old_thread,
*/
__asm__ volatile("mov %0, a1" : "=r"(osp));
osp -= 16;
-z_xtensa_cache_flush((void *)osp, 1);
-z_xtensa_cache_inv((void *)ostack, osp - ostack);
+sys_cache_data_flush_range((void *)osp, 1);
+sys_cache_data_invd_range((void *)ostack, osp - ostack);

uint32_t end = ostack + osz;
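
The flush/invalidate split in arch_cohere_stacks above follows one rule: flush the live part of the outbound stack so its bytes reach RAM, and merely invalidate the dead part below the stack pointer, whose stale lines must never be written back over someone else's stores. A hedged sketch of that rule, with illustrative names rather than actual kernel code:

#include <stdint.h>
#include <zephyr/cache.h>

/* 'stack' is the base, 'sz' the size, 'sp' the current stack pointer
 * inside [stack, stack + sz). Illustrative only.
 */
static void cohere_outbound_stack(uintptr_t stack, size_t sz, uintptr_t sp)
{
	/* Live region [sp, stack + sz): write it back so another CPU
	 * reading through an uncached mapping sees current data.
	 */
	sys_cache_data_flush_range((void *)sp, (stack + sz) - sp);

	/* Dead region [stack, sp): discard any cached lines; their
	 * contents are garbage and must not be evicted to RAM later.
	 */
	sys_cache_data_invd_range((void *)stack, sp - stack);
}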

20 changes: 11 additions & 9 deletions drivers/mm/mm_drv_intel_adsp_mtl_tlb.c
@@ -21,10 +21,11 @@
*/

#include "mm_drv_intel_adsp.h"

#include <soc_util.h>
#include <zephyr/drivers/mm/mm_drv_intel_adsp_mtl_tlb.h>
#include <zephyr/drivers/mm/mm_drv_bank.h>
#include <zephyr/debug/sparse.h>
+#include <zephyr/cache.h>

static struct k_spinlock tlb_lock;
extern struct k_spinlock sys_mm_drv_common_lock;
@@ -269,7 +270,7 @@ int sys_mm_drv_map_page(void *virt, uintptr_t phys, uint32_t flags)
* Invalid the cache of the newly mapped virtual page to
* avoid stale data.
*/
-z_xtensa_cache_inv(virt, CONFIG_MM_DRV_PAGE_SIZE);
+sys_cache_data_invd_range(virt, CONFIG_MM_DRV_PAGE_SIZE);

k_spin_unlock(&tlb_lock, key);

@@ -356,7 +357,7 @@ int sys_mm_drv_unmap_page(void *virt)
* Flush the cache to make sure the backing physical page
* has the latest data.
*/
-z_xtensa_cache_flush(virt, CONFIG_MM_DRV_PAGE_SIZE);
+sys_cache_data_flush_range(virt, CONFIG_MM_DRV_PAGE_SIZE);

entry_idx = get_tlb_entry_idx(va);

@@ -581,8 +582,8 @@ int sys_mm_drv_move_region(void *virt_old, size_t size, void *virt_new,
* flush the cache to make sure the backing physical
* pages have the new data.
*/
-z_xtensa_cache_flush(virt_new, size);
-z_xtensa_cache_flush_inv(virt_old, size);
+sys_cache_data_flush_range(virt_new, size);
+sys_cache_data_flush_and_invd_range(virt_old, size);

return ret;
}
@@ -603,7 +604,7 @@ int sys_mm_drv_move_array(void *virt_old, size_t size, void *virt_new,
* flush the cache to make sure the backing physical
* pages have the new data.
*/
-z_xtensa_cache_flush(va_new, size);
+sys_cache_data_flush_range(va_new, size);

return ret;
}
@@ -722,7 +723,8 @@ static void adsp_mm_save_context(void *storage_buffer)
* all cache data has been flushed before
* do this for pages to remap only
*/
-z_xtensa_cache_inv(UINT_TO_POINTER(phys_addr), CONFIG_MM_DRV_PAGE_SIZE);
+sys_cache_data_invd_range(UINT_TO_POINTER(phys_addr),
+CONFIG_MM_DRV_PAGE_SIZE);

/* Enable the translation in the TLB entry */
entry |= TLB_ENABLE_BIT;
@@ -746,7 +748,7 @@ static void adsp_mm_save_context(void *storage_buffer)
*((uint32_t *) location) = 0;
location += sizeof(uint32_t);

-z_xtensa_cache_flush(
+sys_cache_data_flush_range(
storage_buffer,
(uint32_t)location - (uint32_t)storage_buffer);

@@ -788,7 +790,7 @@ __imr void adsp_mm_restore_context(void *storage_buffer)
bmemcpy(UINT_TO_POINTER(phys_addr_uncached),
location,
CONFIG_MM_DRV_PAGE_SIZE);
-z_xtensa_cache_inv(UINT_TO_POINTER(phys_addr), CONFIG_MM_DRV_PAGE_SIZE);
+sys_cache_data_invd_range(UINT_TO_POINTER(phys_addr), CONFIG_MM_DRV_PAGE_SIZE);

location += CONFIG_MM_DRV_PAGE_SIZE;
phys_addr = *((uint32_t *) location);
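
Both TLB drivers in this PR apply the same discipline: invalidate a virtual range right after mapping it (any cached lines for those addresses are stale) and flush it right before unmapping it (so the backing physical page keeps the latest data). A compact sketch of the two hooks, with illustrative helper names:

#include <zephyr/cache.h>

/* Illustrative: 'virt' is a page of 'page_size' bytes that was just
 * mapped, or is about to be unmapped.
 */
static void on_page_mapped(void *virt, size_t page_size)
{
	/* Drop stale lines so the first read hits the new backing page. */
	sys_cache_data_invd_range(virt, page_size);
}

static void before_page_unmapped(void *virt, size_t page_size)
{
	/* Write dirty lines back so the backing page keeps the data. */
	sys_cache_data_flush_range(virt, page_size);
}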
9 changes: 5 additions & 4 deletions drivers/mm/mm_drv_intel_adsp_tlb.c
@@ -29,6 +29,7 @@
#include <zephyr/sys/check.h>
#include <zephyr/sys/mem_manage.h>
#include <zephyr/sys/util.h>
+#include <zephyr/cache.h>

#include <soc.h>
#include <adsp_memory.h>
@@ -132,7 +133,7 @@ int sys_mm_drv_map_page(void *virt, uintptr_t phys, uint32_t flags)
* Invalid the cache of the newly mapped virtual page to
* avoid stale data.
*/
-z_xtensa_cache_inv(virt, CONFIG_MM_DRV_PAGE_SIZE);
+sys_cache_data_invd_range(virt, CONFIG_MM_DRV_PAGE_SIZE);

k_spin_unlock(&tlb_lock, key);

@@ -185,7 +186,7 @@ int sys_mm_drv_unmap_page(void *virt)
* Flush the cache to make sure the backing physical page
* has the latest data.
*/
-z_xtensa_cache_flush(virt, CONFIG_MM_DRV_PAGE_SIZE);
+sys_cache_data_flush_range(virt, CONFIG_MM_DRV_PAGE_SIZE);

entry_idx = get_tlb_entry_idx(va);

@@ -302,7 +303,7 @@ int sys_mm_drv_move_region(void *virt_old, size_t size, void *virt_new,
* flush the cache to make sure the backing physical
* pages have the new data.
*/
-z_xtensa_cache_flush(va_new, size);
+sys_cache_data_flush_range(va_new, size);

return ret;
}
@@ -323,7 +324,7 @@ int sys_mm_drv_move_array(void *virt_old, size_t size, void *virt_new,
* flush the cache to make sure the backing physical
* pages have the new data.
*/
-z_xtensa_cache_flush(va_new, size);
+sys_cache_data_flush_range(va_new, size);

return ret;
}
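
One behavioral difference worth noting: the retired z_xtensa_cache_* helpers returned void, while the generic sys_cache_data_*_range functions return an int (current Zephyr can report -ENOTSUP where data-cache maintenance is unavailable — worth verifying against the tree in use; the converted call sites here do not check it). A small sketch of a checking caller, with an illustrative wrapper name:

#include <errno.h>
#include <zephyr/cache.h>

/* Illustrative wrapper, not from this PR. */
static int flush_checked(void *buf, size_t len)
{
	int ret = sys_cache_data_flush_range(buf, len);

	/* Treat "no dcache to maintain" as success for portable callers. */
	return (ret == -ENOTSUP) ? 0 : ret;
}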
14 changes: 7 additions & 7 deletions drivers/neural_net/intel_gna.c
@@ -79,7 +79,7 @@ static void intel_gna_interrupt_handler(const struct device *dev)
if (k_msgq_get(&gna->request_queue, &pending_req, K_NO_WAIT) != 0) {
LOG_ERR("Pending request queue is empty");
} else {
-z_xtensa_cache_inv(pending_req.model->output,
+sys_cache_data_invd_range(pending_req.model->output,
pending_req.output_len);
/* copy output from the model buffer to application buffer */
memcpy(pending_req.output, pending_req.model->output,
@@ -194,7 +194,7 @@ static int intel_gna_initialize(const struct device *dev)
dev->name, gna_config_desc.vamaxaddr);

/* flush cache */
-z_xtensa_cache_flush((void *)&gna_config_desc, sizeof(gna_config_desc));
+sys_cache_data_flush_range((void *)&gna_config_desc, sizeof(gna_config_desc));

LOG_INF("%s: initialized (max %u models & max %u pending requests)",
dev->name, GNA_MAX_NUM_MODELS,
@@ -334,7 +334,7 @@ static int intel_gna_register_model(const struct device *dev,

intel_gna_setup_page_table(model->rw_region, rw_size,
virtual_base);
-z_xtensa_cache_flush(model->rw_region, rw_size);
+sys_cache_data_flush_range(model->rw_region, rw_size);
}

if (model->ro_region == NULL) {
@@ -352,8 +352,8 @@ static int intel_gna_register_model(const struct device *dev,
intel_gna_setup_page_table(ro_region, ro_size,
(void *)((uint32_t)virtual_base + rw_size));

-z_xtensa_cache_flush(ro_region, ro_size);
-z_xtensa_cache_flush(gna_page_table, sizeof(gna_page_table));
+sys_cache_data_flush_range(ro_region, ro_size);
+sys_cache_data_flush_range(gna_page_table, sizeof(gna_page_table));

/* copy the model pointers */
gna_model->model = *model;
@@ -461,12 +461,12 @@ static int intel_gna_infer(const struct device *dev,

/* copy input */
memcpy(handle->input, req->input, input_size);
-z_xtensa_cache_flush(handle->input, input_size);
+sys_cache_data_flush_range(handle->input, input_size);

/* assign layer descriptor base address to configuration descriptor */
gna_config_desc.labase = (uint32_t)handle->vabase;
gna_config_desc.lacnt = (uint16_t)header->layer_count;
-z_xtensa_cache_flush(&gna_config_desc, sizeof(gna_config_desc));
+sys_cache_data_flush_range(&gna_config_desc, sizeof(gna_config_desc));

gna->state = GNA_STATE_ACTIVE;
regs->gnactrl = (regs->gnactrl & ~GNA_CTRL_INTR_DISABLE) |
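
The GNA driver above shows the classic CPU/device handoff: flush buffers the CPU wrote before the device reads them, and invalidate buffers the device wrote before the CPU reads them. A hedged sketch with illustrative names, assuming a device that reads and writes RAM directly:

#include <string.h>
#include <zephyr/cache.h>

static void submit_to_device(void *dev_in, const void *src, size_t len)
{
	memcpy(dev_in, src, len);
	/* Push the input out of the dcache so the device sees it in RAM. */
	sys_cache_data_flush_range(dev_in, len);
}

static void collect_from_device(void *dst, void *dev_out, size_t len)
{
	/* Discard stale cached lines before reading what the device wrote. */
	sys_cache_data_invd_range(dev_out, len);
	memcpy(dst, dev_out, len);
}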
2 changes: 2 additions & 0 deletions include/zephyr/arch/cache.h
@@ -21,6 +21,8 @@

#if defined(CONFIG_ARM64)
#include <zephyr/arch/arm64/cache.h>
+#elif defined(CONFIG_XTENSA)
+#include <zephyr/arch/xtensa/cache.h>
#endif

#if defined(CONFIG_DCACHE)
137 changes: 137 additions & 0 deletions include/zephyr/arch/xtensa/arch.h
@@ -28,6 +28,7 @@
#include <xtensa/config/core.h>
#include <zephyr/arch/common/addr_types.h>
#include <zephyr/arch/xtensa/gdbstub.h>
+#include <zephyr/debug/sparse.h>

#ifdef CONFIG_KERNEL_COHERENCE
#define ARCH_STACK_PTR_ALIGN XCHAL_DCACHE_LINESIZE
@@ -84,6 +85,142 @@ static ALWAYS_INLINE void arch_nop(void)
}
#endif


#if defined(CONFIG_XTENSA_RPO_CACHE)
#if defined(CONFIG_ARCH_HAS_COHERENCE)
static inline bool arch_mem_coherent(void *ptr)
{
size_t addr = (size_t) ptr;

return (addr >> 29) == CONFIG_XTENSA_UNCACHED_REGION;
}
#endif

static ALWAYS_INLINE uint32_t z_xtrpoflip(uint32_t addr, uint32_t rto, uint32_t rfrom)
{
/* The math here is all compile-time: when the two regions
* differ by a power of two, we can convert between them by
* setting or clearing just one bit. Otherwise it needs two
* operations.
*/
uint32_t rxor = (rto ^ rfrom) << 29;

rto <<= 29;
if (Z_IS_POW2(rxor)) {
if ((rxor & rto) == 0) {
return addr & ~rxor;
} else {
return addr | rxor;
}
} else {
return (addr & ~(7U << 29)) | rto;
}
}
/**
* @brief Return cached pointer to a RAM address
*
* The Xtensa coherence architecture maps addressable RAM twice, in
* two different 512MB regions whose L1 cache settings can be
* controlled independently. So for any given pointer, it is possible
* to convert it to and from a cached version.
*
* This function takes a pointer to any addressable object (either in
* cacheable memory or not) and returns a pointer that can be used to
* refer to the same memory through the L1 data cache. Data read
* through the resulting pointer will reflect locally cached values on
* the current CPU if they exist, and writes will go first into the
* cache and be written back later.
*
* @see arch_xtensa_uncached_ptr()
*
* @param ptr A pointer to a valid C object
* @return A pointer to the same object via the L1 dcache
*/
static inline void __sparse_cache *arch_xtensa_cached_ptr(void *ptr)
{
return (__sparse_force void __sparse_cache *)z_xtrpoflip((uint32_t) ptr,
CONFIG_XTENSA_CACHED_REGION,
CONFIG_XTENSA_UNCACHED_REGION);
}

/**
* @brief Return uncached pointer to a RAM address
*
* The Xtensa coherence architecture maps addressable RAM twice, in
* two different 512MB regions whose L1 cache settings can be
* controlled independently. So for any given pointer, it is possible
* to convert it to and from a cached version.
*
* This function takes a pointer to any addressable object (either in
* cacheable memory or not) and returns a pointer that can be used to
* refer to the same memory while bypassing the L1 data cache. Data
* in the L1 cache will not be inspected nor modified by the access.
*
* @see arch_xtensa_cached_ptr()
*
* @param ptr A pointer to a valid C object
* @return A pointer to the same object bypassing the L1 dcache
*/
static inline void *arch_xtensa_uncached_ptr(void __sparse_cache *ptr)
{
return (void *)z_xtrpoflip((__sparse_force uint32_t)ptr,
CONFIG_XTENSA_UNCACHED_REGION,
CONFIG_XTENSA_CACHED_REGION);
}

/* Utility to generate an unrolled and optimal[1] code sequence to set
* the RPO TLB registers (contra the HAL cacheattr macros, which
* generate larger code and can't be called from C), based on the
* KERNEL_COHERENCE configuration in use. Selects RPO attribute "2"
* for regions (including MMIO registers in region zero) which want to
* bypass L1, "4" for the cached region which wants writeback, and
* "15" (invalid) elsewhere.
*
* Note that on cores that have the "translation" option set, we need
* to put an identity mapping in the high bits. Also per spec
* changing the current code region (by definition cached) requires
* that WITLB be followed by an ISYNC and that both instructions live
* in the same cache line (two 3-byte instructions fit in an 8-byte
* aligned region, so that's guaranteed not to cross a cache line
* boundary).
*
* [1] With the sole exception of gcc's infuriating insistence on
* emitting a precomputed literal for addr + addrincr instead of
* computing it with a single ADD instruction from values it already
* has in registers. Explicitly assigning the variables to registers
* via an attribute works, but then emits needless MOV instructions
* instead. I tell myself it's just 32 bytes of .text, but... Sigh.
*/
#define _REGION_ATTR(r) \
((r) == 0 ? 2 : \
((r) == CONFIG_XTENSA_CACHED_REGION ? 4 : \
((r) == CONFIG_XTENSA_UNCACHED_REGION ? 2 : 15)))

#define _SET_ONE_TLB(region) do { \
uint32_t attr = _REGION_ATTR(region); \
if (XCHAL_HAVE_XLT_CACHEATTR) { \
attr |= addr; /* RPO with translation */ \
} \
if (region != CONFIG_XTENSA_CACHED_REGION) { \
__asm__ volatile("wdtlb %0, %1; witlb %0, %1" \
:: "r"(attr), "r"(addr)); \
} else { \
__asm__ volatile("wdtlb %0, %1" \
:: "r"(attr), "r"(addr)); \
__asm__ volatile("j 1f; .align 8; 1:"); \
__asm__ volatile("witlb %0, %1; isync" \
:: "r"(attr), "r"(addr)); \
} \
addr += addrincr; \
} while (0)

#define ARCH_XTENSA_SET_RPO_TLB() do { \
register uint32_t addr = 0, addrincr = 0x20000000; \
FOR_EACH(_SET_ONE_TLB, (;), 0, 1, 2, 3, 4, 5, 6, 7); \
} while (0)

#endif

#endif /* !defined(_ASMLANGUAGE) && !defined(__ASSEMBLER__) */

#endif /* ZEPHYR_INCLUDE_ARCH_XTENSA_ARCH_H_ */
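
To make the z_xtrpoflip math above concrete: with illustrative region numbers CACHED=5 and UNCACHED=4 (the real values are SoC-specific Kconfig symbols), rxor = (5 ^ 4) << 29 = 0x20000000 is a power of two, so the cached alias is addr | 0x20000000 and the uncached alias is addr & ~0x20000000 — a single instruction each way. A hedged usage sketch of the new pointer-conversion helpers, assuming CONFIG_XTENSA_RPO_CACHE and a buffer inside the doubly mapped RAM:

#include <zephyr/kernel.h>	/* pulls in the Xtensa <arch.h> above */
#include <zephyr/debug/sparse.h>

int sum_through_cache(void *buf, size_t n)
{
	/* Same memory, addressed through the L1 dcache. */
	uint8_t __sparse_cache *cached =
		(uint8_t __sparse_cache *)arch_xtensa_cached_ptr(buf);
	int sum = 0;

	for (size_t i = 0; i < n; i++) {
		sum += cached[i];
	}
	return sum;
}

void write_bypassing_cache(void __sparse_cache *obj, uint8_t val)
{
	/* Same memory, bypassing the L1 dcache entirely. */
	uint8_t *raw = arch_xtensa_uncached_ptr(obj);

	raw[0] = val;
}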