diff --git a/cores/esp8266/mmu_iram.h b/cores/esp8266/mmu_iram.h index 7ba06be9bb..f7b62ba8c2 100644 --- a/cores/esp8266/mmu_iram.h +++ b/cores/esp8266/mmu_iram.h @@ -26,13 +26,24 @@ extern "C" { #endif -//C This turns on range checking. Is this the value you want to trigger it? +// This turns on range checking. #ifdef DEBUG_ESP_CORE #define DEBUG_ESP_MMU #endif #if defined(CORE_MOCK) #define ets_uart_printf(...) do {} while(false) +#define XCHAL_INSTRAM0_VADDR 0x40000000 +#define XCHAL_INSTRAM1_VADDR 0x40100000 +#define XCHAL_INSTROM0_VADDR 0x40200000 +#else +#include // For config/core-isa.h +/* + Cautiously use XCHAL_..._VADDR values where possible. + While XCHAL_..._VADDR values in core-isa.h may define the Xtensa processor + CONFIG options, they are not always an indication of DRAM, IRAM, or ROM + size or position in the address space. +*/ #endif /* @@ -71,32 +82,34 @@ DBG_MMU_FLUSH(0) static inline __attribute__((always_inline)) bool mmu_is_iram(const void *addr) { - #define IRAM_START 0x40100000UL + const uintptr_t iram_start = (uintptr_t)XCHAL_INSTRAM1_VADDR; #ifndef MMU_IRAM_SIZE #if defined(__GNUC__) && !defined(CORE_MOCK) #warning "MMU_IRAM_SIZE was undefined, setting to 0x8000UL!" #endif - #define MMU_IRAM_SIZE 0x8000UL + #define MMU_IRAM_SIZE 0x8000ul #endif - #define IRAM_END (IRAM_START + MMU_IRAM_SIZE) + const uintptr_t iram_end = iram_start + MMU_IRAM_SIZE; - return (IRAM_START <= (uintptr_t)addr && IRAM_END > (uintptr_t)addr); + return (iram_start <= (uintptr_t)addr && iram_end > (uintptr_t)addr); } static inline __attribute__((always_inline)) bool mmu_is_dram(const void *addr) { - #define DRAM_START 0x3FF80000UL - #define DRAM_END 0x40000000UL + const uintptr_t dram_start = 0x3FFE8000ul; + // The start of the Boot ROM sits at the end of DRAM. 0x40000000ul; + const uintptr_t dram_end = (uintptr_t)XCHAL_INSTRAM0_VADDR; - return (DRAM_START <= (uintptr_t)addr && DRAM_END > (uintptr_t)addr); + return (dram_start <= (uintptr_t)addr && dram_end > (uintptr_t)addr); } static inline __attribute__((always_inline)) bool mmu_is_icache(const void *addr) { - #define ICACHE_START 0x40200000UL - #define ICACHE_END (ICACHE_START + 0x100000UL) + extern void _irom0_text_end(void); + const uintptr_t icache_start = (uintptr_t)XCHAL_INSTROM0_VADDR; + const uintptr_t icache_end = (uintptr_t)_irom0_text_end; - return (ICACHE_START <= (uintptr_t)addr && ICACHE_END > (uintptr_t)addr); + return (icache_start <= (uintptr_t)addr && icache_end > (uintptr_t)addr); } #ifdef DEBUG_ESP_MMU @@ -127,8 +140,26 @@ bool mmu_is_icache(const void *addr) { static inline __attribute__((always_inline)) uint8_t mmu_get_uint8(const void *p8) { ASSERT_RANGE_TEST_READ(p8); - uint32_t val = (*(uint32_t *)((uintptr_t)p8 & ~0x3)); - uint32_t pos = ((uintptr_t)p8 & 0x3) * 8; + // https://gist.github.com/shafik/848ae25ee209f698763cffee272a58f8#how-do-we-type-pun-correctly + // Comply with strict-aliasing rules. Using memcpy is a Standards suggested + // method for type punning. The compiler optimizer will replace the memcpy + // with an `l32i` instruction. Using __builtin_memcpy to ensure we get the + // effects of the compiler optimization and not some #define version of + // memcpy. + void *v32 = (void *)((uintptr_t)p8 & ~(uintptr_t)3u); + uint32_t val; + __builtin_memcpy(&val, v32, sizeof(uint32_t)); + // Use an empty ASM to reference the 32-bit value. This will block the + // compiler from immediately optimizing to an 8-bit or 16-bit load instruction + // against IRAM memory. (This approach was inspired by + // https://github.com/esp8266/Arduino/pull/7780#discussion_r548303374) + // This issue was seen when using a constant address with the GCC 10.3 + // compiler. + // As a general practice, I think referencing by way of Extended ASM R/W + // output register will stop the the compiler from reloading the value later + // as 8-bit load from IRAM. + asm volatile ("" :"+r"(val)); // inject 32-bit dependency + uint32_t pos = ((uintptr_t)p8 & 3u) * 8u; val >>= pos; return (uint8_t)val; } @@ -136,8 +167,11 @@ uint8_t mmu_get_uint8(const void *p8) { static inline __attribute__((always_inline)) uint16_t mmu_get_uint16(const uint16_t *p16) { ASSERT_RANGE_TEST_READ(p16); - uint32_t val = (*(uint32_t *)((uintptr_t)p16 & ~0x3)); - uint32_t pos = ((uintptr_t)p16 & 0x3) * 8; + void *v32 = (void *)((uintptr_t)p16 & ~(uintptr_t)0x3u); + uint32_t val; + __builtin_memcpy(&val, v32, sizeof(uint32_t)); + asm volatile ("" :"+r"(val)); + uint32_t pos = ((uintptr_t)p16 & 3u) * 8u; val >>= pos; return (uint16_t)val; } @@ -145,8 +179,11 @@ uint16_t mmu_get_uint16(const uint16_t *p16) { static inline __attribute__((always_inline)) int16_t mmu_get_int16(const int16_t *p16) { ASSERT_RANGE_TEST_READ(p16); - uint32_t val = (*(uint32_t *)((uintptr_t)p16 & ~0x3)); - uint32_t pos = ((uintptr_t)p16 & 0x3) * 8; + void *v32 = (void *)((uintptr_t)p16 & ~(uintptr_t)3u); + uint32_t val; + __builtin_memcpy(&val, v32, sizeof(uint32_t)); + asm volatile ("" :"+r"(val)); + uint32_t pos = ((uintptr_t)p16 & 3u) * 8u; val >>= pos; return (int16_t)val; } @@ -154,30 +191,43 @@ int16_t mmu_get_int16(const int16_t *p16) { static inline __attribute__((always_inline)) uint8_t mmu_set_uint8(void *p8, const uint8_t val) { ASSERT_RANGE_TEST_WRITE(p8); - uint32_t pos = ((uintptr_t)p8 & 0x3) * 8; + uint32_t pos = ((uintptr_t)p8 & 3u) * 8u; uint32_t sval = val << pos; - uint32_t valmask = 0x0FF << pos; + uint32_t valmask = 0x0FFu << pos; + + void *v32 = (void *)((uintptr_t)p8 & ~(uintptr_t)3u); + uint32_t ival; + __builtin_memcpy(&ival, v32, sizeof(uint32_t)); + asm volatile ("" :"+r"(ival)); - uint32_t *p32 = (uint32_t *)((uintptr_t)p8 & ~0x3); - uint32_t ival = *p32; ival &= (~valmask); ival |= sval; - *p32 = ival; + /* + This 32-bit dependency injection does not appear to be needed with the + current GCC 10.3; however, that could change in the future versions. Or, I + may not have the right test for it to fail. + */ + asm volatile ("" :"+r"(ival)); + __builtin_memcpy(v32, &ival, sizeof(uint32_t)); return val; } static inline __attribute__((always_inline)) uint16_t mmu_set_uint16(uint16_t *p16, const uint16_t val) { ASSERT_RANGE_TEST_WRITE(p16); - uint32_t pos = ((uintptr_t)p16 & 0x3) * 8; + uint32_t pos = ((uintptr_t)p16 & 3u) * 8u; uint32_t sval = val << pos; - uint32_t valmask = 0x0FFFF << pos; + uint32_t valmask = 0x0FFFFu << pos; + + void *v32 = (void *)((uintptr_t)p16 & ~(uintptr_t)3u); + uint32_t ival; + __builtin_memcpy(&ival, v32, sizeof(uint32_t)); + asm volatile ("" :"+r"(ival)); - uint32_t *p32 = (uint32_t *)((uintptr_t)p16 & ~0x3); - uint32_t ival = *p32; ival &= (~valmask); ival |= sval; - *p32 = ival; + asm volatile ("" :"+r"(ival)); + __builtin_memcpy(v32, &ival, sizeof(uint32_t)); return val; } @@ -185,32 +235,36 @@ static inline __attribute__((always_inline)) int16_t mmu_set_int16(int16_t *p16, const int16_t val) { ASSERT_RANGE_TEST_WRITE(p16); uint32_t sval = (uint16_t)val; - uint32_t pos = ((uintptr_t)p16 & 0x3) * 8; + uint32_t pos = ((uintptr_t)p16 & 3u) * 8u; sval <<= pos; - uint32_t valmask = 0x0FFFF << pos; + uint32_t valmask = 0x0FFFFu << pos; + + void *v32 = (void *)((uintptr_t)p16 & ~(uintptr_t)3u); + uint32_t ival; + __builtin_memcpy(&ival, v32, sizeof(uint32_t)); + asm volatile ("" :"+r"(ival)); - uint32_t *p32 = (uint32_t *)((uintptr_t)p16 & ~0x3); - uint32_t ival = *p32; ival &= (~valmask); ival |= sval; - *p32 = ival; + asm volatile ("" :"+r"(ival)); + __builtin_memcpy(v32, &ival, sizeof(uint32_t)); return val; } #if (MMU_IRAM_SIZE > 32*1024) && !defined(MMU_SEC_HEAP) -extern void _text_end(void); #define MMU_SEC_HEAP mmu_sec_heap() #define MMU_SEC_HEAP_SIZE mmu_sec_heap_size() static inline __attribute__((always_inline)) void *mmu_sec_heap(void) { - uint32_t sec_heap = (uint32_t)_text_end + 32; - return (void *)(sec_heap &= ~7); + extern void _text_end(void); + uintptr_t sec_heap = (uintptr_t)_text_end + (uintptr_t)32u; + return (void *)(sec_heap &= ~(uintptr_t)7u); } static inline __attribute__((always_inline)) size_t mmu_sec_heap_size(void) { - return (size_t)0xC000UL - ((size_t)mmu_sec_heap() - 0x40100000UL); + return (size_t)0xC000ul - ((uintptr_t)mmu_sec_heap() - (uintptr_t)XCHAL_INSTRAM1_VADDR); } #endif diff --git a/libraries/esp8266/examples/IramReserve/IramReserve.ino b/libraries/esp8266/examples/IramReserve/IramReserve.ino index 344622f958..96d7479da4 100644 --- a/libraries/esp8266/examples/IramReserve/IramReserve.ino +++ b/libraries/esp8266/examples/IramReserve/IramReserve.ino @@ -17,6 +17,12 @@ #include #if defined(UMM_HEAP_IRAM) +#if defined(CORE_MOCK) +#define XCHAL_INSTRAM1_VADDR 0x40100000 +#else +#include // For config/core-isa.h +#endif + // durable - as in long life, persisting across reboots. struct durable { uint32_t bootCounter; @@ -30,7 +36,7 @@ struct durable { #define IRAM_RESERVE_SZ ((sizeof(struct durable) + 7UL) & ~7UL) // Position its address just above the reduced 2nd Heap. -#define IRAM_RESERVE (0x40100000UL + 0xC000UL - IRAM_RESERVE_SZ) +#define IRAM_RESERVE ((uintptr_t)XCHAL_INSTRAM1_VADDR + 0xC000UL - IRAM_RESERVE_SZ) // Define a reference with the right properties to make access easier. #define DURABLE ((struct durable *)IRAM_RESERVE) @@ -100,9 +106,9 @@ extern "C" void umm_init_iram(void) { adjustments and checksums. These can affect the persistence of data across reboots. */ - uint32_t sec_heap = (uint32_t)_text_end + 32; + uintptr_t sec_heap = (uintptr_t)_text_end + 32; sec_heap &= ~7; - size_t sec_heap_sz = 0xC000UL - (sec_heap - 0x40100000UL); + size_t sec_heap_sz = 0xC000UL - (sec_heap - (uintptr_t)XCHAL_INSTRAM1_VADDR); sec_heap_sz -= IRAM_RESERVE_SZ; // Shrink IRAM heap if (0xC000UL > sec_heap_sz) { diff --git a/libraries/esp8266/examples/MMU48K/MMU48K.ino b/libraries/esp8266/examples/MMU48K/MMU48K.ino index 6acb3840f9..d75d91232b 100644 --- a/libraries/esp8266/examples/MMU48K/MMU48K.ino +++ b/libraries/esp8266/examples/MMU48K/MMU48K.ino @@ -3,6 +3,12 @@ #include #include +#if defined(CORE_MOCK) +#define XCHAL_INSTRAM1_VADDR 0x40100000 +#else +#include // For config/core-isa.h +#endif + uint32_t timed_byte_read(char *pc, uint32_t * o); uint32_t timed_byte_read2(char *pc, uint32_t * o); int divideA_B(int a, int b); @@ -102,7 +108,7 @@ void print_mmu_status(Print& oStream) { #ifdef MMU_IRAM_SIZE oStream.printf_P(PSTR(" IRAM Size: %u"), MMU_IRAM_SIZE); oStream.println(); - const uint32_t iram_free = MMU_IRAM_SIZE - (uint32_t)((uintptr_t)_text_end - 0x40100000UL); + const uint32_t iram_free = MMU_IRAM_SIZE - (uint32_t)((uintptr_t)_text_end - (uintptr_t)XCHAL_INSTRAM1_VADDR); oStream.printf_P(PSTR(" IRAM free: %u"), iram_free); oStream.println(); #endif diff --git a/libraries/esp8266/examples/irammem/irammem.ino b/libraries/esp8266/examples/irammem/irammem.ino index 169d53e05e..dccc34a6a4 100644 --- a/libraries/esp8266/examples/irammem/irammem.ino +++ b/libraries/esp8266/examples/irammem/irammem.ino @@ -14,6 +14,204 @@ #define ETS_PRINTF ets_uart_printf #endif +/* + Verify mmu_get_uint16()'s compliance with strict-aliasing rules under + different optimizations. +*/ + +#pragma GCC push_options +// reference +#pragma GCC optimize("O0") // We expect -O0 to generate the correct results +__attribute__((noinline)) +void aliasTestReference(uint16_t *x) { + // Without adhearance to strict-aliasing, this sequence of code would fail + // when optimized by GCC Version 10.3 + size_t len = 3; + for (size_t u = 0; u < len; u++) { + uint16_t x1 = mmu_get_uint16(&x[0]); + for (size_t v = 0; v < len; v++) { + x[v] = mmu_get_uint16(&x[v]) + x1; + } + } +} +// Tests +#pragma GCC optimize("Os") +__attribute__((noinline)) +void aliasTestOs(uint16_t *x) { + size_t len = 3; + for (size_t u = 0; u < len; u++) { + uint16_t x1 = mmu_get_uint16(&x[0]); + for (size_t v = 0; v < len; v++) { + x[v] = mmu_get_uint16(&x[v]) + x1; + } + } +} +#pragma GCC optimize("O2") +__attribute__((noinline)) +void aliasTestO2(uint16_t *x) { + size_t len = 3; + for (size_t u = 0; u < len; u++) { + uint16_t x1 = mmu_get_uint16(&x[0]); + for (size_t v = 0; v < len; v++) { + x[v] = mmu_get_uint16(&x[v]) + x1; + } + } +} +#pragma GCC optimize("O3") +__attribute__((noinline)) +void aliasTestO3(uint16_t *x) { + size_t len = 3; + for (size_t u = 0; u < len; u++) { + uint16_t x1 = mmu_get_uint16(&x[0]); + for (size_t v = 0; v < len; v++) { + x[v] = mmu_get_uint16(&x[v]) + x1; + } + } +} + +// Evaluate if optomizer may have changed 32-bit access to 8-bit. +// 8-bit access will take longer as it will be processed thought +// the exception handler. For this case the -O0 version will appear faster. +#pragma GCC optimize("O0") +__attribute__((noinline)) IRAM_ATTR +uint32_t timedRead_Reference(uint8_t *res) { + // This test case was verified with GCC 10.3 + // There is a code case that can result in 32-bit wide IRAM load from memory + // being optimized down to an 8-bit memory access. In this test case we need + // to supply a constant IRAM address that is not 0 when anded with 3u. + // This section verifies that the workaround implimented by the inline + // function mmu_get_uint8() is preventing this. See comments for function + // mmu_get_uint8(() in mmu_iram.h for more details. + const uint8_t *x = (const uint8_t *)0x40100003ul; + uint32_t b = ESP.getCycleCount(); + *res = mmu_get_uint8(x); + return ESP.getCycleCount() - b; +} +#pragma GCC optimize("Os") +__attribute__((noinline)) IRAM_ATTR +uint32_t timedRead_Os(uint8_t *res) { + const uint8_t *x = (const uint8_t *)0x40100003ul; + uint32_t b = ESP.getCycleCount(); + *res = mmu_get_uint8(x); + return ESP.getCycleCount() - b; +} +#pragma GCC optimize("O2") +__attribute__((noinline)) IRAM_ATTR +uint32_t timedRead_O2(uint8_t *res) { + const uint8_t *x = (const uint8_t *)0x40100003ul; + uint32_t b = ESP.getCycleCount(); + *res = mmu_get_uint8(x); + return ESP.getCycleCount() - b; +} +#pragma GCC optimize("O3") +__attribute__((noinline)) IRAM_ATTR +uint32_t timedRead_O3(uint8_t *res) { + const uint8_t *x = (const uint8_t *)0x40100003ul; + uint32_t b = ESP.getCycleCount(); + *res = mmu_get_uint8(x); + return ESP.getCycleCount() - b; +} +#pragma GCC pop_options + +bool test4_32bit_loads() { + bool result = true; + uint8_t res; + uint32_t cycle_count_ref, cycle_count; + Serial.printf("\r\nFor mmu_get_uint8, verify that 32-bit wide IRAM access is preserved across different optimizations:\r\n"); + cycle_count_ref = timedRead_Reference(&res); + /* + If the optimizer (for options -Os, -O2, and -O3) replaces the 32-bit wide + IRAM access with an 8-bit, the exception handler will get invoked on memory + reads. The total execution time will show a significant increase when + compared to the reference (option -O0). + */ + Serial.printf(" Option -O0, cycle count %5u - reference\r\n", cycle_count_ref); + cycle_count = timedRead_Os(&res); + Serial.printf(" Option -Os, cycle count %5u ", cycle_count); + if (cycle_count_ref > cycle_count) { + Serial.printf("- passed\r\n"); + } else { + result = false; + Serial.printf("- failed\r\n"); + } + cycle_count = timedRead_O2(&res); + Serial.printf(" Option -O2, cycle count %5u ", cycle_count); + if (cycle_count_ref > cycle_count) { + Serial.printf("- passed\r\n"); + } else { + result = false; + Serial.printf("- failed\r\n"); + } + cycle_count = timedRead_O3(&res); + Serial.printf(" Option -O3, cycle count %5u ", cycle_count); + if (cycle_count_ref > cycle_count) { + Serial.printf("- passed\r\n"); + } else { + result = false; + Serial.printf("- failed\r\n"); + } + return result; +} + +void printPunFail(uint16_t *ref, uint16_t *x, size_t sz) { + Serial.printf(" Expected:"); + for (size_t i = 0; i < sz; i++) { + Serial.printf(" %3u", ref[i]); + } + Serial.printf("\r\n Got: "); + for (size_t i = 0; i < sz; i++) { + Serial.printf(" %3u", x[i]); + } + Serial.printf("\r\n"); +} + +bool testPunning() { + bool result = true; + // Get reference result for verifing test + alignas(uint32_t) uint16_t x_ref[] = {1, 2, 3, 0}; + aliasTestReference(x_ref); // -O0 + Serial.printf("mmu_get_uint16() strict-aliasing tests with different optimizations:\r\n"); + + { + alignas(alignof(uint32_t)) uint16_t x[] = {1, 2, 3, 0}; + aliasTestOs(x); + Serial.printf(" Option -Os "); + if (0 == memcmp(x_ref, x, sizeof(x_ref))) { + Serial.printf("- passed\r\n"); + } else { + result = false; + Serial.printf("- failed\r\n"); + printPunFail(x_ref, x, sizeof(x_ref) / sizeof(uint16_t)); + } + } + { + alignas(alignof(uint32_t)) uint16_t x[] = {1, 2, 3, 0}; + aliasTestO2(x); + Serial.printf(" Option -O2 "); + if (0 == memcmp(x_ref, x, sizeof(x_ref))) { + Serial.printf("- passed\r\n"); + } else { + result = false; + Serial.printf("- failed\r\n"); + printPunFail(x_ref, x, sizeof(x_ref) / sizeof(uint16_t)); + } + } + { + alignas(alignof(uint32_t)) uint16_t x[] = {1, 2, 3, 0}; + aliasTestO3(x); + Serial.printf(" Option -O3 "); + if (0 == memcmp(x_ref, x, sizeof(x_ref))) { + Serial.printf("- passed\r\n"); + } else { + result = false; + Serial.printf("- failed\r\n"); + printPunFail(x_ref, x, sizeof(x_ref) / sizeof(uint16_t)); + } + } + return result; +} + + uint32_t cyclesToRead_nKx32(int n, unsigned int *x, uint32_t *res) { uint32_t b = ESP.getCycleCount(); uint32_t sum = 0; @@ -95,7 +293,6 @@ uint32_t cyclesToWrite_nKx8(int n, unsigned char*x) { } // Compare with Inline - uint32_t cyclesToRead_nKx16_viaInline(int n, unsigned short *x, uint32_t *res) { uint32_t b = ESP.getCycleCount(); uint32_t sum = 0; @@ -159,6 +356,7 @@ uint32_t cyclesToWrite_nKx8_viaInline(int n, unsigned char*x) { return ESP.getCycleCount() - b; } + bool perfTest_nK(int nK, uint32_t *mem, uint32_t *imem) { uint32_t res, verify_res; uint32_t t; @@ -317,7 +515,7 @@ void setup() { Serial.println(); - if (perfTest_nK(1, mem, imem)) { + if (perfTest_nK(1, mem, imem) && testPunning() && test4_32bit_loads()) { Serial.println(); } else { Serial.println("\r\n*******************************"); pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy