From a9e6fd8ef07ea207a8fa258da207bbd0523bc079 Mon Sep 17 00:00:00 2001 From: Marouen Ghodhbane Date: Wed, 9 Oct 2024 17:31:45 +0200 Subject: [PATCH 1/3] portable: aarch64_sre: add configUSE_TASK_FPU_SUPPORT support This is a direct backport of upstream commit [1] for aarch64 (legacy operation port) done under [2] The same code can be applied on the aarch SRE port to be able to enable FPU context saving on all tasks context switch to mitigate GCC optimization to use SIMD registers for copy. [1] "55eceb22: Add configUSE_TASK_FPU_SUPPORT to AARCH64 port (#1048)" [2] https://github.com/FreeRTOS/FreeRTOS-Kernel/pull/1048 Signed-off-by: Marouen Ghodhbane --- portable/GCC/ARM_AARCH64_SRE/port.c | 56 +++++++++++++++++++----- portable/GCC/ARM_AARCH64_SRE/portmacro.h | 15 +++++-- 2 files changed, 56 insertions(+), 15 deletions(-) diff --git a/portable/GCC/ARM_AARCH64_SRE/port.c b/portable/GCC/ARM_AARCH64_SRE/port.c index 1c96c0efd5..8cebcaf54e 100644 --- a/portable/GCC/ARM_AARCH64_SRE/port.c +++ b/portable/GCC/ARM_AARCH64_SRE/port.c @@ -121,6 +121,10 @@ ::"r" ( portUNMASK_VALUE ) ); \ } +/* The space on the stack required to hold the FPU registers. + * There are 32 128-bit registers.*/ +#define portFPU_REGISTER_WORDS ( 32 * 2 ) + /*-----------------------------------------------------------*/ /* @@ -229,23 +233,47 @@ StackType_t * pxPortInitialiseStack( StackType_t * pxTopOfStack, *pxTopOfStack = ( StackType_t ) 0x00; /* XZR - has no effect, used so there are an even number of registers. */ pxTopOfStack--; *pxTopOfStack = ( StackType_t ) 0x00; /* R30 - procedure call link register. */ - pxTopOfStack--; - - *pxTopOfStack = portINITIAL_PSTATE; - pxTopOfStack--; - *pxTopOfStack = ( StackType_t ) pxCode; /* Exception return address. */ pxTopOfStack--; + *pxTopOfStack = portINITIAL_PSTATE; - /* The task will start with a critical nesting count of 0 as interrupts are - * enabled. */ - *pxTopOfStack = portNO_CRITICAL_NESTING; pxTopOfStack--; + *pxTopOfStack = ( StackType_t ) pxCode; /* Exception return address. */ - /* The task will start without a floating point context. A task that uses - * the floating point hardware must call vPortTaskUsesFPU() before executing - * any floating point instructions. */ - *pxTopOfStack = portNO_FLOATING_POINT_CONTEXT; + #if ( configUSE_TASK_FPU_SUPPORT == 1 ) + { + /* The task will start with a critical nesting count of 0 as interrupts are + * enabled. */ + pxTopOfStack--; + *pxTopOfStack = portNO_CRITICAL_NESTING; + + /* The task will start without a floating point context. A task that + * uses the floating point hardware must call vPortTaskUsesFPU() before + * executing any floating point instructions. */ + pxTopOfStack--; + *pxTopOfStack = portNO_FLOATING_POINT_CONTEXT; + } + #elif ( configUSE_TASK_FPU_SUPPORT == 2 ) + { + /* The task will start with a floating point context. Leave enough + * space for the registers - and ensure they are initialised to 0. */ + pxTopOfStack -= portFPU_REGISTER_WORDS; + memset( pxTopOfStack, 0x00, portFPU_REGISTER_WORDS * sizeof( StackType_t ) ); + + /* The task will start with a critical nesting count of 0 as interrupts are + * enabled. */ + pxTopOfStack--; + *pxTopOfStack = portNO_CRITICAL_NESTING; + + pxTopOfStack--; + *pxTopOfStack = pdTRUE; + ullPortTaskHasFPUContext = pdTRUE; + } + #else /* if ( configUSE_TASK_FPU_SUPPORT == 1 ) */ + { + #error "Invalid configUSE_TASK_FPU_SUPPORT setting - configUSE_TASK_FPU_SUPPORT must be set to 1, 2, or left undefined." + } + #endif /* if ( configUSE_TASK_FPU_SUPPORT == 1 ) */ return pxTopOfStack; } @@ -384,6 +412,8 @@ void FreeRTOS_Tick_Handler( void ) } /*-----------------------------------------------------------*/ +#if ( configUSE_TASK_FPU_SUPPORT != 2 ) + void vPortTaskUsesFPU( void ) { /* A task is registering the fact that it needs an FPU context. Set the @@ -393,6 +423,8 @@ void vPortTaskUsesFPU( void ) /* Consider initialising the FPSR here - but probably not necessary in * AArch64. */ } + +#endif /* configUSE_TASK_FPU_SUPPORT */ /*-----------------------------------------------------------*/ void vPortClearInterruptMask( UBaseType_t uxNewMaskValue ) diff --git a/portable/GCC/ARM_AARCH64_SRE/portmacro.h b/portable/GCC/ARM_AARCH64_SRE/portmacro.h index 296984d5b5..5810741d2f 100644 --- a/portable/GCC/ARM_AARCH64_SRE/portmacro.h +++ b/portable/GCC/ARM_AARCH64_SRE/portmacro.h @@ -135,9 +135,18 @@ extern void vPortInstallFreeRTOSVectorTable( void ); * handler for whichever peripheral is used to generate the RTOS tick. */ void FreeRTOS_Tick_Handler( void ); -/* Any task that uses the floating point unit MUST call vPortTaskUsesFPU() - * before any floating point instructions are executed. */ -void vPortTaskUsesFPU( void ); +/* If configUSE_TASK_FPU_SUPPORT is set to 1 (or left undefined) then tasks are + * created without an FPU context and must call vPortTaskUsesFPU() to give + * themselves an FPU context before using any FPU instructions. If + * configUSE_TASK_FPU_SUPPORT is set to 2 then all tasks will have an FPU context + * by default. */ +#if ( configUSE_TASK_FPU_SUPPORT != 2 ) + void vPortTaskUsesFPU( void ); +#else + /* Each task has an FPU context already, so define this function away to + * nothing to prevent it from being called accidentally. */ + #define vPortTaskUsesFPU() +#endif #define portTASK_USES_FLOATING_POINT() vPortTaskUsesFPU() #define portLOWEST_INTERRUPT_PRIORITY ( ( ( uint32_t ) configUNIQUE_INTERRUPT_PRIORITIES ) - 1UL ) From 05751adfe142ea7a9bae21854590ec8a06acaa44 Mon Sep 17 00:00:00 2001 From: Marouen Ghodhbane Date: Wed, 9 Oct 2024 18:34:55 +0200 Subject: [PATCH 2/3] portable: aarch64_sre: add the configuration and status registers to the fpu saved context FPSR and FPCR are two 64-bits registers where only the lower 32 bits are defined. Save them when doing context switch with FPU context saving enabled. Signed-off-by: Marouen Ghodhbane --- portable/GCC/ARM_AARCH64_SRE/port.c | 4 ++-- portable/GCC/ARM_AARCH64_SRE/portASM.S | 10 +++++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/portable/GCC/ARM_AARCH64_SRE/port.c b/portable/GCC/ARM_AARCH64_SRE/port.c index 8cebcaf54e..ffc34d746d 100644 --- a/portable/GCC/ARM_AARCH64_SRE/port.c +++ b/portable/GCC/ARM_AARCH64_SRE/port.c @@ -122,8 +122,8 @@ } /* The space on the stack required to hold the FPU registers. - * There are 32 128-bit registers.*/ -#define portFPU_REGISTER_WORDS ( 32 * 2 ) + * There are 32 128-bit plus 2 64-bit status registers.*/ +#define portFPU_REGISTER_WORDS ( (32 * 2) + 2 ) /*-----------------------------------------------------------*/ diff --git a/portable/GCC/ARM_AARCH64_SRE/portASM.S b/portable/GCC/ARM_AARCH64_SRE/portASM.S index ed3c031d09..8d69b2aa1a 100644 --- a/portable/GCC/ARM_AARCH64_SRE/portASM.S +++ b/portable/GCC/ARM_AARCH64_SRE/portASM.S @@ -87,7 +87,7 @@ LDR X0, ullPortTaskHasFPUContextConst LDR X2, [X0] - /* Save the FPU context, if any (32 128-bit registers). */ + /* Save the FPU context, if any (32 128-bit plus two 64-bit status registers). */ CMP X2, #0 B.EQ 1f STP Q0, Q1, [SP,#-0x20]! @@ -107,6 +107,11 @@ STP Q28, Q29, [SP,#-0x20]! STP Q30, Q31, [SP,#-0x20]! + /* Even though upper 32 bits of FPSR and FPCR are reserved, save and restore the whole 64 bits to keep 16-byte SP alignement. */ + MRS X9, FPSR + MRS X10, FPCR + STP X9, X10, [SP, #-0x10]! + 1: /* Store the critical nesting count and FPU context indicator. */ STP X2, X3, [SP, #-0x10]! @@ -157,6 +162,7 @@ /* Restore the FPU context, if any. */ CMP X2, #0 B.EQ 1f + LDP X9, X10, [SP], #0x10 LDP Q30, Q31, [SP], #0x20 LDP Q28, Q29, [SP], #0x20 LDP Q26, Q27, [SP], #0x20 @@ -173,6 +179,8 @@ LDP Q4, Q5, [SP], #0x20 LDP Q2, Q3, [SP], #0x20 LDP Q0, Q1, [SP], #0x20 + MSR FPSR, X9 + MSR FPCR, X10 1: LDP X2, X3, [SP], #0x10 /* SPSR and ELR. */ From 067bd6a4d54c030af6b49b7a899ef89f31fd8da8 Mon Sep 17 00:00:00 2001 From: Marouen Ghodhbane Date: Fri, 11 Oct 2024 15:36:42 +0200 Subject: [PATCH 3/3] portable: aarch64_sre: Add support for vApplicationFPUSafeIRQHandler The application writer needs to name their IRQ handler as: 1. vApplicationIRQHandler if the IRQ handler does not use FPU registers. 2. vApplicationFPUSafeIRQHandler is the IRQ handler uses FPU registers. When the application uses vApplicationFPUSafeIRQHandler, a default implementation of vApplicationIRQHandler is used which stores FPU registers and then calls vApplicationFPUSafeIRQHandler. Note that recent versions of GCC may use FP/SIMD registers to optimize 16-bytes copy and especially when using va_start()/va_arg() functions (e.g printing some thing in IRQ handlers may trigger usage of FPU registers) This implementation is heavily inspired by both the ARM_CA9 port and the ARM_CRx_No_GIC port done in [1] [1] https://github.com/FreeRTOS/FreeRTOS-Kernel/pull/1113 Signed-off-by: Marouen Ghodhbane --- portable/GCC/ARM_AARCH64_SRE/port.c | 27 ++++++++++ portable/GCC/ARM_AARCH64_SRE/portASM.S | 74 ++++++++++++++++++++++++++ 2 files changed, 101 insertions(+) diff --git a/portable/GCC/ARM_AARCH64_SRE/port.c b/portable/GCC/ARM_AARCH64_SRE/port.c index ffc34d746d..ab9290d432 100644 --- a/portable/GCC/ARM_AARCH64_SRE/port.c +++ b/portable/GCC/ARM_AARCH64_SRE/port.c @@ -133,6 +133,27 @@ */ extern void vPortRestoreTaskContext( void ); +/* + * If the application provides an implementation of vApplicationIRQHandler(), + * then it will get called directly without saving the FPU registers on + * interrupt entry, and this weak implementation of + * vApplicationFPUSafeIRQHandler() is just provided to remove linkage errors - + * it should never actually get called so its implementation contains a + * call to configASSERT() that will always fail. + * + * If the application provides its own implementation of + * vApplicationFPUSafeIRQHandler() then the implementation of + * vApplicationIRQHandler() provided in portASM.S will save the FPU registers + * before calling it. + * + * Therefore, if the application writer wants FPU registers to be saved on + * interrupt entry their IRQ handler must be called + * vApplicationFPUSafeIRQHandler(), and if the application writer does not want + * FPU registers to be saved on interrupt entry their IRQ handler must be + * called vApplicationIRQHandler(). + */ +void vApplicationFPUSafeIRQHandler( uint32_t ulICCIAR ) __attribute__((weak) ); + /*-----------------------------------------------------------*/ /* A variable is used to keep track of the critical section nesting. This @@ -495,3 +516,9 @@ UBaseType_t uxPortSetInterruptMask( void ) #endif /* configASSERT_DEFINED */ /*-----------------------------------------------------------*/ + +void vApplicationFPUSafeIRQHandler( uint32_t ulICCIAR ) +{ + ( void ) ulICCIAR; + configASSERT( ( volatile void * ) NULL ); +} diff --git a/portable/GCC/ARM_AARCH64_SRE/portASM.S b/portable/GCC/ARM_AARCH64_SRE/portASM.S index 8d69b2aa1a..f1f59cd338 100644 --- a/portable/GCC/ARM_AARCH64_SRE/portASM.S +++ b/portable/GCC/ARM_AARCH64_SRE/portASM.S @@ -414,8 +414,82 @@ Exit_IRQ_No_Context_Switch: ERET +/****************************************************************************** + * If the application provides an implementation of vApplicationIRQHandler(), + * then it will get called directly without saving the FPU registers on + * interrupt entry, and this weak implementation of + * vApplicationIRQHandler() will not get called. + * + * If the application provides its own implementation of + * vApplicationFPUSafeIRQHandler() then this implementation of + * vApplicationIRQHandler() will be called, save the FPU registers, and then + * call vApplicationFPUSafeIRQHandler(). + * + * Therefore, if the application writer wants FPU registers to be saved on + * interrupt entry their IRQ handler must be called + * vApplicationFPUSafeIRQHandler(), and if the application writer does not want + * FPU registers to be saved on interrupt entry their IRQ handler must be + * called vApplicationIRQHandler(). + *****************************************************************************/ + +.align 8 +.weak vApplicationIRQHandler +.type vApplicationIRQHandler, %function +vApplicationIRQHandler: + /* Save LR and FP on the stack */ + STP X29, X30, [SP, #-0x10]! + + /* Save FPU registers (32 128-bits + 2 64-bits configuration and status registers) */ + STP Q0, Q1, [SP,#-0x20]! + STP Q2, Q3, [SP,#-0x20]! + STP Q4, Q5, [SP,#-0x20]! + STP Q6, Q7, [SP,#-0x20]! + STP Q8, Q9, [SP,#-0x20]! + STP Q10, Q11, [SP,#-0x20]! + STP Q12, Q13, [SP,#-0x20]! + STP Q14, Q15, [SP,#-0x20]! + STP Q16, Q17, [SP,#-0x20]! + STP Q18, Q19, [SP,#-0x20]! + STP Q20, Q21, [SP,#-0x20]! + STP Q22, Q23, [SP,#-0x20]! + STP Q24, Q25, [SP,#-0x20]! + STP Q26, Q27, [SP,#-0x20]! + STP Q28, Q29, [SP,#-0x20]! + STP Q30, Q31, [SP,#-0x20]! + + /* Even though upper 32 bits of FPSR and FPCR are reserved, save and restore the whole 64 bits to keep 16-byte SP alignement. */ + MRS X9, FPSR + MRS X10, FPCR + STP X9, X10, [SP, #-0x10]! + /* Call the C handler. */ + BL vApplicationFPUSafeIRQHandler + + /* Restore FPU registers */ + + LDP X9, X10, [SP], #0x10 + LDP Q30, Q31, [SP], #0x20 + LDP Q28, Q29, [SP], #0x20 + LDP Q26, Q27, [SP], #0x20 + LDP Q24, Q25, [SP], #0x20 + LDP Q22, Q23, [SP], #0x20 + LDP Q20, Q21, [SP], #0x20 + LDP Q18, Q19, [SP], #0x20 + LDP Q16, Q17, [SP], #0x20 + LDP Q14, Q15, [SP], #0x20 + LDP Q12, Q13, [SP], #0x20 + LDP Q10, Q11, [SP], #0x20 + LDP Q8, Q9, [SP], #0x20 + LDP Q6, Q7, [SP], #0x20 + LDP Q4, Q5, [SP], #0x20 + LDP Q2, Q3, [SP], #0x20 + LDP Q0, Q1, [SP], #0x20 + MSR FPSR, X9 + MSR FPCR, X10 + /* Restore FP and LR */ + LDP X29, X30, [SP], #0x10 + RET .align 8 pxCurrentTCBConst: .dword pxCurrentTCB