diff --git a/cores/esp8266/core_esp8266_non32xfer.cpp b/cores/esp8266/core_esp8266_non32xfer.cpp index eb4741e1ed..0d7bb19196 100644 --- a/cores/esp8266/core_esp8266_non32xfer.cpp +++ b/cores/esp8266/core_esp8266_non32xfer.cpp @@ -18,10 +18,10 @@ */ /* - * This exception handler, allows for byte or short accesses to iRAM or PROGMEM - * to succeed without causing a crash. It is still preferred to use the xxx_P - * macros whenever possible, since they are probably 30x faster than this - * exception handler method. + * This exception handler handles EXCCAUSE_LOAD_STORE_ERROR. It allows for a + * byte or short access to iRAM or PROGMEM to succeed without causing a crash. + * When reading, it is still preferred to use the xxx_P macros when possible + * since they are probably 30x faster than this exception handler method. * * Code taken directly from @pvvx's public domain code in * https://github.com/pvvx/esp8266web/blob/master/app/sdklib/system/app_main.c @@ -37,6 +37,16 @@ #include #include +// All of these optimization were tried and now work +// These results were from irammem.ino using GCC 10.2 +// DRAM reference uint16 9 AVG cycles/transfer +// #pragma GCC optimize("O0") // uint16, 289 AVG cycles/transfer, IRAM: +180 +// #pragma GCC optimize("O1") // uint16, 241 AVG cycles/transfer, IRAM: +16 +#pragma GCC optimize("O2") // uint16, 230 AVG cycles/transfer, IRAM: +4 +// #pragma GCC optimize("O3") // uint16, 230 AVG cycles/transfer, IRAM: +4 +// #pragma GCC optimize("Ofast") // uint16, 230 AVG cycles/transfer, IRAM: +4 +// #pragma GCC optimize("Os") // uint16, 233 AVG cycles/transfer, IRAM: 27556 +0 + extern "C" { #define LOAD_MASK 0x00f00fu @@ -50,32 +60,55 @@ extern "C" { static fn_c_exception_handler_t old_c_handler = NULL; -static IRAM_ATTR void non32xfer_exception_handler(struct __exception_frame *ef, int cause) +static +IRAM_ATTR void non32xfer_exception_handler(struct __exception_frame *ef, int cause) { do { /* - Had to split out some of the asm, compiler was 
reusing a register that it - needed later. A crash would come or go away with the slightest unrelated - changes elsewhere in the function. - - Register a15 was used for epc1, then clobbered for rsr. Maybe an - __asm("":::"memory") before starting the asm would help for these cases. - For this instance moved setting epc1 closer to where it was used. - Edit. "&" on output register would have resolved the problem. - Refactored to reduce and consolidate register usage. + In adapting the public domain version, a crash would come or go away with + the slightest unrelated changes elsewhere in the function. Observed that + register a15 was used for epc1, then clobbered by `rsr.` I now believe a + "&" on the output register would have resolved the problem. + + However, I have refactored the Extended ASM to reduce and consolidate + register usage and corrected the issue. + + The positioning of the Extended ASM block (as early as possible in the + compiled function) is in part controlled by the immediate need for + output variable `insn`. This placement aids in getting excvaddr read as + early as possible. 
*/ - uint32_t insn; - __asm( - "movi %0, ~3\n\t" /* prepare a mask for the EPC */ - "and %0, %0, %1\n\t" /* apply mask for 32bit aligned base */ - "ssa8l %1\n\t" /* set up shift register for src op */ - "l32i %1, %0, 0\n\t" /* load part 1 */ - "l32i %0, %0, 4\n\t" /* load part 2 */ - "src %0, %0, %1\n\t" /* right shift to get faulting instruction */ - :"=&r"(insn) - :"r"(ef->epc) - : - ); + uint32_t insn, excvaddr; +#if 1 + { + uint32_t tmp; + __asm__ ( + "rsr.excvaddr %[vaddr]\n\t" /* Read faulting address as early as possible */ + "movi.n %[tmp], ~3\n\t" /* prepare a mask for the EPC */ + "and %[tmp], %[tmp], %[epc]\n\t" /* apply mask for 32-bit aligned base */ + "ssa8l %[epc]\n\t" /* set up shift register for src op */ + "l32i %[insn], %[tmp], 0\n\t" /* load part 1 */ + "l32i %[tmp], %[tmp], 4\n\t" /* load part 2 */ + "src %[insn], %[tmp], %[insn]\n\t" /* right shift to get faulting instruction */ + : [vaddr]"=&r"(excvaddr), [insn]"=&r"(insn), [tmp]"=&r"(tmp) + : [epc]"r"(ef->epc) :); + } + +#else + { + __asm__ __volatile__ ("rsr.excvaddr %0;" : "=r"(excvaddr):: "memory"); + /* + "C" reference code for the ASM to document intent. + May also prove useful when isolating possible issues with Extended ASM, + optimizations, new compilers, etc. 
+ */ + uint32_t epc = ef->epc; + uint32_t *pWord = (uint32_t *)(epc & ~3); + uint64_t big_word = ((uint64_t)pWord[1] << 32) | pWord[0]; + uint32_t pos = (epc & 3) * 8; + insn = (uint32_t)(big_word >>= pos); + } +#endif uint32_t what = insn & LOAD_MASK; uint32_t valmask = 0; @@ -102,10 +135,6 @@ static IRAM_ATTR void non32xfer_exception_handler(struct __exception_frame *ef, --regno; /* account for skipped a1 in exception_frame */ } - uint32_t excvaddr; - /* read out the faulting address */ - __asm("rsr %0, EXCVADDR;" :"=r"(excvaddr)::); - #ifdef DEBUG_ESP_MMU /* debug option to validate address so we don't hide memory access bugs in APP */ if (mmu_is_iram((void *)excvaddr) || (is_read && mmu_is_icache((void *)excvaddr))) { @@ -114,31 +143,34 @@ static IRAM_ATTR void non32xfer_exception_handler(struct __exception_frame *ef, continue; /* fail */ } #endif - - if (is_read) { - /* Load, shift and mask down to correct size */ - uint32_t val = (*(uint32_t *)(excvaddr & ~0x3)); - val >>= (excvaddr & 0x3) * 8; - val &= valmask; - - /* Sign-extend for L16SI, if applicable */ - if (what == L16SI_MATCH && (val & 0x8000)) { - val |= 0xffff0000; + { + uint32_t *pWord = (uint32_t *)(excvaddr & ~0x3); + uint32_t pos = (excvaddr & 0x3) * 8; + uint32_t mem_val = *pWord; + + if (is_read) { + /* shift and mask down to correct size */ + mem_val >>= pos; + mem_val &= valmask; + + /* Sign-extend for L16SI, if applicable */ + if (what == L16SI_MATCH && (mem_val & 0x8000)) { + mem_val |= 0xffff0000; + } + + ef->a_reg[regno] = mem_val; /* carry out the load */ + + } else { /* is write */ + uint32_t val = ef->a_reg[regno]; /* get value to store from register */ + val <<= pos; + valmask <<= pos; + val &= valmask; + + /* mask out field, and merge */ + mem_val &= (~valmask); + mem_val |= val; + *pWord = mem_val; /* carry out the store */ } - - ef->a_reg[regno] = val; /* carry out the load */ - - } else { /* is write */ - uint32_t val = ef->a_reg[regno]; /* get value to store from register */ - 
val <<= (excvaddr & 0x3) * 8; - valmask <<= (excvaddr & 0x3) * 8; - val &= valmask; - - /* Load, mask out field, and merge */ - uint32_t dst_val = (*(uint32_t *)(excvaddr & ~0x3)); - dst_val &= (~valmask); - dst_val |= val; - (*(uint32_t *)(excvaddr & ~0x3)) = dst_val; /* carry out the store */ } ef->epc += 3; /* resume at following instruction */ @@ -201,6 +233,7 @@ static void _set_exception_handler_wrapper(int cause) { } } +void install_non32xfer_exception_handler(void) __attribute__((weak)); void install_non32xfer_exception_handler(void) { if (NULL == old_c_handler) { // Set the "C" exception handler the wrapper will call diff --git a/cores/esp8266/exc-c-wrapper-handler.S b/cores/esp8266/exc-c-wrapper-handler.S index f1ec1c391e..872702aa2d 100644 --- a/cores/esp8266/exc-c-wrapper-handler.S +++ b/cores/esp8266/exc-c-wrapper-handler.S @@ -189,6 +189,9 @@ _xtos_c_wrapper_handler: // Restore special registers l32i a14, a1, UEXC_sar + // load early - saves two cycles - @mhightower83 + movi a0, _xtos_return_from_exc + // For compatibility with Arduino interrupt architecture, we keep interrupts 100% // disabled. // /* @@ -201,7 +204,6 @@ _xtos_c_wrapper_handler: // rsil a12, 1 // XCHAL_EXCM_LEVEL rsil a12, 15 // All levels blocked. wsr a14, SAR - movi a0, _xtos_return_from_exc jx a0 /* FIXME: what about _GeneralException ? */