diff --git a/cores/esp8266/mmu_iram.h b/cores/esp8266/mmu_iram.h index 7ba06be9bb..f7b62ba8c2 100644 --- a/cores/esp8266/mmu_iram.h +++ b/cores/esp8266/mmu_iram.h @@ -26,13 +26,24 @@ extern "C" { #endif -//C This turns on range checking. Is this the value you want to trigger it? +// This turns on range checking. #ifdef DEBUG_ESP_CORE #define DEBUG_ESP_MMU #endif #if defined(CORE_MOCK) #define ets_uart_printf(...) do {} while(false) +#define XCHAL_INSTRAM0_VADDR 0x40000000 +#define XCHAL_INSTRAM1_VADDR 0x40100000 +#define XCHAL_INSTROM0_VADDR 0x40200000 +#else +#include // For config/core-isa.h +/* + Cautiously use XCHAL_..._VADDR values where possible. + While XCHAL_..._VADDR values in core-isa.h may define the Xtensa processor + CONFIG options, they are not always an indication of DRAM, IRAM, or ROM + size or position in the address space. +*/ #endif /* @@ -71,32 +82,34 @@ DBG_MMU_FLUSH(0) static inline __attribute__((always_inline)) bool mmu_is_iram(const void *addr) { - #define IRAM_START 0x40100000UL + const uintptr_t iram_start = (uintptr_t)XCHAL_INSTRAM1_VADDR; #ifndef MMU_IRAM_SIZE #if defined(__GNUC__) && !defined(CORE_MOCK) #warning "MMU_IRAM_SIZE was undefined, setting to 0x8000UL!" #endif - #define MMU_IRAM_SIZE 0x8000UL + #define MMU_IRAM_SIZE 0x8000ul #endif - #define IRAM_END (IRAM_START + MMU_IRAM_SIZE) + const uintptr_t iram_end = iram_start + MMU_IRAM_SIZE; - return (IRAM_START <= (uintptr_t)addr && IRAM_END > (uintptr_t)addr); + return (iram_start <= (uintptr_t)addr && iram_end > (uintptr_t)addr); } static inline __attribute__((always_inline)) bool mmu_is_dram(const void *addr) { - #define DRAM_START 0x3FF80000UL - #define DRAM_END 0x40000000UL + const uintptr_t dram_start = 0x3FFE8000ul; + // The start of the Boot ROM sits at the end of DRAM. 0x40000000ul; + const uintptr_t dram_end = (uintptr_t)XCHAL_INSTRAM0_VADDR; - return (DRAM_START <= (uintptr_t)addr && DRAM_END > (uintptr_t)addr); + return (dram_start <= (uintptr_t)addr && dram_end > (uintptr_t)addr); } static inline __attribute__((always_inline)) bool mmu_is_icache(const void *addr) { - #define ICACHE_START 0x40200000UL - #define ICACHE_END (ICACHE_START + 0x100000UL) + extern void _irom0_text_end(void); + const uintptr_t icache_start = (uintptr_t)XCHAL_INSTROM0_VADDR; + const uintptr_t icache_end = (uintptr_t)_irom0_text_end; - return (ICACHE_START <= (uintptr_t)addr && ICACHE_END > (uintptr_t)addr); + return (icache_start <= (uintptr_t)addr && icache_end > (uintptr_t)addr); } #ifdef DEBUG_ESP_MMU @@ -127,8 +140,26 @@ bool mmu_is_icache(const void *addr) { static inline __attribute__((always_inline)) uint8_t mmu_get_uint8(const void *p8) { ASSERT_RANGE_TEST_READ(p8); - uint32_t val = (*(uint32_t *)((uintptr_t)p8 & ~0x3)); - uint32_t pos = ((uintptr_t)p8 & 0x3) * 8; + // https://gist.github.com/shafik/848ae25ee209f698763cffee272a58f8#how-do-we-type-pun-correctly + // Comply with strict-aliasing rules. Using memcpy is a Standards suggested + // method for type punning. The compiler optimizer will replace the memcpy + // with an `l32i` instruction. Using __builtin_memcpy to ensure we get the + // effects of the compiler optimization and not some #define version of + // memcpy. + void *v32 = (void *)((uintptr_t)p8 & ~(uintptr_t)3u); + uint32_t val; + __builtin_memcpy(&val, v32, sizeof(uint32_t)); + // Use an empty ASM to reference the 32-bit value. This will block the + // compiler from immediately optimizing to an 8-bit or 16-bit load instruction + // against IRAM memory. (This approach was inspired by + // https://github.com/esp8266/Arduino/pull/7780#discussion_r548303374) + // This issue was seen when using a constant address with the GCC 10.3 + // compiler. + // As a general practice, I think referencing by way of Extended ASM R/W + // output register will stop the the compiler from reloading the value later + // as 8-bit load from IRAM. + asm volatile ("" :"+r"(val)); // inject 32-bit dependency + uint32_t pos = ((uintptr_t)p8 & 3u) * 8u; val >>= pos; return (uint8_t)val; } @@ -136,8 +167,11 @@ uint8_t mmu_get_uint8(const void *p8) { static inline __attribute__((always_inline)) uint16_t mmu_get_uint16(const uint16_t *p16) { ASSERT_RANGE_TEST_READ(p16); - uint32_t val = (*(uint32_t *)((uintptr_t)p16 & ~0x3)); - uint32_t pos = ((uintptr_t)p16 & 0x3) * 8; + void *v32 = (void *)((uintptr_t)p16 & ~(uintptr_t)0x3u); + uint32_t val; + __builtin_memcpy(&val, v32, sizeof(uint32_t)); + asm volatile ("" :"+r"(val)); + uint32_t pos = ((uintptr_t)p16 & 3u) * 8u; val >>= pos; return (uint16_t)val; } @@ -145,8 +179,11 @@ uint16_t mmu_get_uint16(const uint16_t *p16) { static inline __attribute__((always_inline)) int16_t mmu_get_int16(const int16_t *p16) { ASSERT_RANGE_TEST_READ(p16); - uint32_t val = (*(uint32_t *)((uintptr_t)p16 & ~0x3)); - uint32_t pos = ((uintptr_t)p16 & 0x3) * 8; + void *v32 = (void *)((uintptr_t)p16 & ~(uintptr_t)3u); + uint32_t val; + __builtin_memcpy(&val, v32, sizeof(uint32_t)); + asm volatile ("" :"+r"(val)); + uint32_t pos = ((uintptr_t)p16 & 3u) * 8u; val >>= pos; return (int16_t)val; } @@ -154,30 +191,43 @@ int16_t mmu_get_int16(const int16_t *p16) { static inline __attribute__((always_inline)) uint8_t mmu_set_uint8(void *p8, const uint8_t val) { ASSERT_RANGE_TEST_WRITE(p8); - uint32_t pos = ((uintptr_t)p8 & 0x3) * 8; + uint32_t pos = ((uintptr_t)p8 & 3u) * 8u; uint32_t sval = val << pos; - uint32_t valmask = 0x0FF << pos; + uint32_t valmask = 0x0FFu << pos; + + void *v32 = (void *)((uintptr_t)p8 & ~(uintptr_t)3u); + uint32_t ival; + __builtin_memcpy(&ival, v32, sizeof(uint32_t)); + asm volatile ("" :"+r"(ival)); - uint32_t *p32 = (uint32_t *)((uintptr_t)p8 & ~0x3); - uint32_t ival = *p32; ival &= (~valmask); ival |= sval; - *p32 = ival; + /* + This 32-bit dependency injection does not appear to be needed with the + current GCC 10.3; however, that could change in the future versions. Or, I + may not have the right test for it to fail. + */ + asm volatile ("" :"+r"(ival)); + __builtin_memcpy(v32, &ival, sizeof(uint32_t)); return val; } static inline __attribute__((always_inline)) uint16_t mmu_set_uint16(uint16_t *p16, const uint16_t val) { ASSERT_RANGE_TEST_WRITE(p16); - uint32_t pos = ((uintptr_t)p16 & 0x3) * 8; + uint32_t pos = ((uintptr_t)p16 & 3u) * 8u; uint32_t sval = val << pos; - uint32_t valmask = 0x0FFFF << pos; + uint32_t valmask = 0x0FFFFu << pos; + + void *v32 = (void *)((uintptr_t)p16 & ~(uintptr_t)3u); + uint32_t ival; + __builtin_memcpy(&ival, v32, sizeof(uint32_t)); + asm volatile ("" :"+r"(ival)); - uint32_t *p32 = (uint32_t *)((uintptr_t)p16 & ~0x3); - uint32_t ival = *p32; ival &= (~valmask); ival |= sval; - *p32 = ival; + asm volatile ("" :"+r"(ival)); + __builtin_memcpy(v32, &ival, sizeof(uint32_t)); return val; } @@ -185,32 +235,36 @@ static inline __attribute__((always_inline)) int16_t mmu_set_int16(int16_t *p16, const int16_t val) { ASSERT_RANGE_TEST_WRITE(p16); uint32_t sval = (uint16_t)val; - uint32_t pos = ((uintptr_t)p16 & 0x3) * 8; + uint32_t pos = ((uintptr_t)p16 & 3u) * 8u; sval <<= pos; - uint32_t valmask = 0x0FFFF << pos; + uint32_t valmask = 0x0FFFFu << pos; + + void *v32 = (void *)((uintptr_t)p16 & ~(uintptr_t)3u); + uint32_t ival; + __builtin_memcpy(&ival, v32, sizeof(uint32_t)); + asm volatile ("" :"+r"(ival)); - uint32_t *p32 = (uint32_t *)((uintptr_t)p16 & ~0x3); - uint32_t ival = *p32; ival &= (~valmask); ival |= sval; - *p32 = ival; + asm volatile ("" :"+r"(ival)); + __builtin_memcpy(v32, &ival, sizeof(uint32_t)); return val; } #if (MMU_IRAM_SIZE > 32*1024) && !defined(MMU_SEC_HEAP) -extern void _text_end(void); #define MMU_SEC_HEAP mmu_sec_heap() #define MMU_SEC_HEAP_SIZE mmu_sec_heap_size() static inline __attribute__((always_inline)) void *mmu_sec_heap(void) { - uint32_t sec_heap = (uint32_t)_text_end + 32; - return (void *)(sec_heap &= ~7); + extern void _text_end(void); + uintptr_t sec_heap = (uintptr_t)_text_end + (uintptr_t)32u; + return (void *)(sec_heap &= ~(uintptr_t)7u); } static inline __attribute__((always_inline)) size_t mmu_sec_heap_size(void) { - return (size_t)0xC000UL - ((size_t)mmu_sec_heap() - 0x40100000UL); + return (size_t)0xC000ul - ((uintptr_t)mmu_sec_heap() - (uintptr_t)XCHAL_INSTRAM1_VADDR); } #endif diff --git a/libraries/esp8266/examples/IramReserve/IramReserve.ino b/libraries/esp8266/examples/IramReserve/IramReserve.ino index 344622f958..96d7479da4 100644 --- a/libraries/esp8266/examples/IramReserve/IramReserve.ino +++ b/libraries/esp8266/examples/IramReserve/IramReserve.ino @@ -17,6 +17,12 @@ #include #if defined(UMM_HEAP_IRAM) +#if defined(CORE_MOCK) +#define XCHAL_INSTRAM1_VADDR 0x40100000 +#else +#include // For config/core-isa.h +#endif + // durable - as in long life, persisting across reboots. struct durable { uint32_t bootCounter; @@ -30,7 +36,7 @@ struct durable { #define IRAM_RESERVE_SZ ((sizeof(struct durable) + 7UL) & ~7UL) // Position its address just above the reduced 2nd Heap. -#define IRAM_RESERVE (0x40100000UL + 0xC000UL - IRAM_RESERVE_SZ) +#define IRAM_RESERVE ((uintptr_t)XCHAL_INSTRAM1_VADDR + 0xC000UL - IRAM_RESERVE_SZ) // Define a reference with the right properties to make access easier. #define DURABLE ((struct durable *)IRAM_RESERVE) @@ -100,9 +106,9 @@ extern "C" void umm_init_iram(void) { adjustments and checksums. These can affect the persistence of data across reboots. */ - uint32_t sec_heap = (uint32_t)_text_end + 32; + uintptr_t sec_heap = (uintptr_t)_text_end + 32; sec_heap &= ~7; - size_t sec_heap_sz = 0xC000UL - (sec_heap - 0x40100000UL); + size_t sec_heap_sz = 0xC000UL - (sec_heap - (uintptr_t)XCHAL_INSTRAM1_VADDR); sec_heap_sz -= IRAM_RESERVE_SZ; // Shrink IRAM heap if (0xC000UL > sec_heap_sz) { diff --git a/libraries/esp8266/examples/MMU48K/MMU48K.ino b/libraries/esp8266/examples/MMU48K/MMU48K.ino index 6acb3840f9..d75d91232b 100644 --- a/libraries/esp8266/examples/MMU48K/MMU48K.ino +++ b/libraries/esp8266/examples/MMU48K/MMU48K.ino @@ -3,6 +3,12 @@ #include #include +#if defined(CORE_MOCK) +#define XCHAL_INSTRAM1_VADDR 0x40100000 +#else +#include // For config/core-isa.h +#endif + uint32_t timed_byte_read(char *pc, uint32_t * o); uint32_t timed_byte_read2(char *pc, uint32_t * o); int divideA_B(int a, int b); @@ -102,7 +108,7 @@ void print_mmu_status(Print& oStream) { #ifdef MMU_IRAM_SIZE oStream.printf_P(PSTR(" IRAM Size: %u"), MMU_IRAM_SIZE); oStream.println(); - const uint32_t iram_free = MMU_IRAM_SIZE - (uint32_t)((uintptr_t)_text_end - 0x40100000UL); + const uint32_t iram_free = MMU_IRAM_SIZE - (uint32_t)((uintptr_t)_text_end - (uintptr_t)XCHAL_INSTRAM1_VADDR); oStream.printf_P(PSTR(" IRAM free: %u"), iram_free); oStream.println(); #endif diff --git a/libraries/esp8266/examples/irammem/irammem.ino b/libraries/esp8266/examples/irammem/irammem.ino index 169d53e05e..dccc34a6a4 100644 --- a/libraries/esp8266/examples/irammem/irammem.ino +++ b/libraries/esp8266/examples/irammem/irammem.ino @@ -14,6 +14,204 @@ #define ETS_PRINTF ets_uart_printf #endif +/* + Verify mmu_get_uint16()'s compliance with strict-aliasing rules under + different optimizations. +*/ + +#pragma GCC push_options +// reference +#pragma GCC optimize("O0") // We expect -O0 to generate the correct results +__attribute__((noinline)) +void aliasTestReference(uint16_t *x) { + // Without adhearance to strict-aliasing, this sequence of code would fail + // when optimized by GCC Version 10.3 + size_t len = 3; + for (size_t u = 0; u < len; u++) { + uint16_t x1 = mmu_get_uint16(&x[0]); + for (size_t v = 0; v < len; v++) { + x[v] = mmu_get_uint16(&x[v]) + x1; + } + } +} +// Tests +#pragma GCC optimize("Os") +__attribute__((noinline)) +void aliasTestOs(uint16_t *x) { + size_t len = 3; + for (size_t u = 0; u < len; u++) { + uint16_t x1 = mmu_get_uint16(&x[0]); + for (size_t v = 0; v < len; v++) { + x[v] = mmu_get_uint16(&x[v]) + x1; + } + } +} +#pragma GCC optimize("O2") +__attribute__((noinline)) +void aliasTestO2(uint16_t *x) { + size_t len = 3; + for (size_t u = 0; u < len; u++) { + uint16_t x1 = mmu_get_uint16(&x[0]); + for (size_t v = 0; v < len; v++) { + x[v] = mmu_get_uint16(&x[v]) + x1; + } + } +} +#pragma GCC optimize("O3") +__attribute__((noinline)) +void aliasTestO3(uint16_t *x) { + size_t len = 3; + for (size_t u = 0; u < len; u++) { + uint16_t x1 = mmu_get_uint16(&x[0]); + for (size_t v = 0; v < len; v++) { + x[v] = mmu_get_uint16(&x[v]) + x1; + } + } +} + +// Evaluate if optomizer may have changed 32-bit access to 8-bit. +// 8-bit access will take longer as it will be processed thought +// the exception handler. For this case the -O0 version will appear faster. +#pragma GCC optimize("O0") +__attribute__((noinline)) IRAM_ATTR +uint32_t timedRead_Reference(uint8_t *res) { + // This test case was verified with GCC 10.3 + // There is a code case that can result in 32-bit wide IRAM load from memory + // being optimized down to an 8-bit memory access. In this test case we need + // to supply a constant IRAM address that is not 0 when anded with 3u. + // This section verifies that the workaround implimented by the inline + // function mmu_get_uint8() is preventing this. See comments for function + // mmu_get_uint8(() in mmu_iram.h for more details. + const uint8_t *x = (const uint8_t *)0x40100003ul; + uint32_t b = ESP.getCycleCount(); + *res = mmu_get_uint8(x); + return ESP.getCycleCount() - b; +} +#pragma GCC optimize("Os") +__attribute__((noinline)) IRAM_ATTR +uint32_t timedRead_Os(uint8_t *res) { + const uint8_t *x = (const uint8_t *)0x40100003ul; + uint32_t b = ESP.getCycleCount(); + *res = mmu_get_uint8(x); + return ESP.getCycleCount() - b; +} +#pragma GCC optimize("O2") +__attribute__((noinline)) IRAM_ATTR +uint32_t timedRead_O2(uint8_t *res) { + const uint8_t *x = (const uint8_t *)0x40100003ul; + uint32_t b = ESP.getCycleCount(); + *res = mmu_get_uint8(x); + return ESP.getCycleCount() - b; +} +#pragma GCC optimize("O3") +__attribute__((noinline)) IRAM_ATTR +uint32_t timedRead_O3(uint8_t *res) { + const uint8_t *x = (const uint8_t *)0x40100003ul; + uint32_t b = ESP.getCycleCount(); + *res = mmu_get_uint8(x); + return ESP.getCycleCount() - b; +} +#pragma GCC pop_options + +bool test4_32bit_loads() { + bool result = true; + uint8_t res; + uint32_t cycle_count_ref, cycle_count; + Serial.printf("\r\nFor mmu_get_uint8, verify that 32-bit wide IRAM access is preserved across different optimizations:\r\n"); + cycle_count_ref = timedRead_Reference(&res); + /* + If the optimizer (for options -Os, -O2, and -O3) replaces the 32-bit wide + IRAM access with an 8-bit, the exception handler will get invoked on memory + reads. The total execution time will show a significant increase when + compared to the reference (option -O0). + */ + Serial.printf(" Option -O0, cycle count %5u - reference\r\n", cycle_count_ref); + cycle_count = timedRead_Os(&res); + Serial.printf(" Option -Os, cycle count %5u ", cycle_count); + if (cycle_count_ref > cycle_count) { + Serial.printf("- passed\r\n"); + } else { + result = false; + Serial.printf("- failed\r\n"); + } + cycle_count = timedRead_O2(&res); + Serial.printf(" Option -O2, cycle count %5u ", cycle_count); + if (cycle_count_ref > cycle_count) { + Serial.printf("- passed\r\n"); + } else { + result = false; + Serial.printf("- failed\r\n"); + } + cycle_count = timedRead_O3(&res); + Serial.printf(" Option -O3, cycle count %5u ", cycle_count); + if (cycle_count_ref > cycle_count) { + Serial.printf("- passed\r\n"); + } else { + result = false; + Serial.printf("- failed\r\n"); + } + return result; +} + +void printPunFail(uint16_t *ref, uint16_t *x, size_t sz) { + Serial.printf(" Expected:"); + for (size_t i = 0; i < sz; i++) { + Serial.printf(" %3u", ref[i]); + } + Serial.printf("\r\n Got: "); + for (size_t i = 0; i < sz; i++) { + Serial.printf(" %3u", x[i]); + } + Serial.printf("\r\n"); +} + +bool testPunning() { + bool result = true; + // Get reference result for verifing test + alignas(uint32_t) uint16_t x_ref[] = {1, 2, 3, 0}; + aliasTestReference(x_ref); // -O0 + Serial.printf("mmu_get_uint16() strict-aliasing tests with different optimizations:\r\n"); + + { + alignas(alignof(uint32_t)) uint16_t x[] = {1, 2, 3, 0}; + aliasTestOs(x); + Serial.printf(" Option -Os "); + if (0 == memcmp(x_ref, x, sizeof(x_ref))) { + Serial.printf("- passed\r\n"); + } else { + result = false; + Serial.printf("- failed\r\n"); + printPunFail(x_ref, x, sizeof(x_ref) / sizeof(uint16_t)); + } + } + { + alignas(alignof(uint32_t)) uint16_t x[] = {1, 2, 3, 0}; + aliasTestO2(x); + Serial.printf(" Option -O2 "); + if (0 == memcmp(x_ref, x, sizeof(x_ref))) { + Serial.printf("- passed\r\n"); + } else { + result = false; + Serial.printf("- failed\r\n"); + printPunFail(x_ref, x, sizeof(x_ref) / sizeof(uint16_t)); + } + } + { + alignas(alignof(uint32_t)) uint16_t x[] = {1, 2, 3, 0}; + aliasTestO3(x); + Serial.printf(" Option -O3 "); + if (0 == memcmp(x_ref, x, sizeof(x_ref))) { + Serial.printf("- passed\r\n"); + } else { + result = false; + Serial.printf("- failed\r\n"); + printPunFail(x_ref, x, sizeof(x_ref) / sizeof(uint16_t)); + } + } + return result; +} + + uint32_t cyclesToRead_nKx32(int n, unsigned int *x, uint32_t *res) { uint32_t b = ESP.getCycleCount(); uint32_t sum = 0; @@ -95,7 +293,6 @@ uint32_t cyclesToWrite_nKx8(int n, unsigned char*x) { } // Compare with Inline - uint32_t cyclesToRead_nKx16_viaInline(int n, unsigned short *x, uint32_t *res) { uint32_t b = ESP.getCycleCount(); uint32_t sum = 0; @@ -159,6 +356,7 @@ uint32_t cyclesToWrite_nKx8_viaInline(int n, unsigned char*x) { return ESP.getCycleCount() - b; } + bool perfTest_nK(int nK, uint32_t *mem, uint32_t *imem) { uint32_t res, verify_res; uint32_t t; @@ -317,7 +515,7 @@ void setup() { Serial.println(); - if (perfTest_nK(1, mem, imem)) { + if (perfTest_nK(1, mem, imem) && testPunning() && test4_32bit_loads()) { Serial.println(); } else { Serial.println("\r\n*******************************");