Skip to content

Commit b7a2f44

Browse files
authored
Update mmu_get... and mmu_set... (#8290)
These changes are needed to address bugs that can emerge with the improved optimization from the GCC 10.3 compiler.

Updated the performance inline functions `mmu_get_uint8()`, ... and `mmu_set_uint8()`, ... to comply with strict-aliasing rules. Without this change, stale data may be referenced. This issue was revealed in discussions on #8261 (comment).

Changes to avoid over-optimization of 32-bit wide transfers from IRAM, which the new GCC 10.3 compiler can turn into 8-bit or 16-bit transfers. This has been a recurring/tricky problem for me with the new compiler. So far, referencing the 32-bit value loaded by way of an Extended ASM R/W output register has stopped the compiler from optimizing down to an 8-bit or 16-bit transfer.

Example:
```cpp
uint32_t val;
__builtin_memcpy(&val, v32, sizeof(uint32_t));
asm volatile ("" :"+r"(val)); // inject 32-bit dependency
...
```

Updated example `irammem.ino`:
* Do a simple test of compliance with strict-aliasing rules.
* For `mmu_get_uint8()`, added tests to evaluate if 32-bit wide transfers were converted to an 8-bit transfer.
1 parent 9d024d1 commit b7a2f44

File tree

4 files changed

+306
-42
lines changed

4 files changed

+306
-42
lines changed

cores/esp8266/mmu_iram.h

Lines changed: 90 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -26,13 +26,24 @@
2626
extern "C" {
2727
#endif
2828

29-
//C This turns on range checking. Is this the value you want to trigger it?
29+
// This turns on range checking.
3030
#ifdef DEBUG_ESP_CORE
3131
#define DEBUG_ESP_MMU
3232
#endif
3333

3434
#if defined(CORE_MOCK)
3535
#define ets_uart_printf(...) do {} while(false)
36+
#define XCHAL_INSTRAM0_VADDR 0x40000000
37+
#define XCHAL_INSTRAM1_VADDR 0x40100000
38+
#define XCHAL_INSTROM0_VADDR 0x40200000
39+
#else
40+
#include <sys/config.h> // For config/core-isa.h
41+
/*
42+
Cautiously use XCHAL_..._VADDR values where possible.
43+
While XCHAL_..._VADDR values in core-isa.h may define the Xtensa processor
44+
CONFIG options, they are not always an indication of DRAM, IRAM, or ROM
45+
size or position in the address space.
46+
*/
3647
#endif
3748

3849
/*
@@ -71,32 +82,34 @@ DBG_MMU_FLUSH(0)
7182

7283
static inline __attribute__((always_inline))
7384
bool mmu_is_iram(const void *addr) {
74-
#define IRAM_START 0x40100000UL
85+
const uintptr_t iram_start = (uintptr_t)XCHAL_INSTRAM1_VADDR;
7586
#ifndef MMU_IRAM_SIZE
7687
#if defined(__GNUC__) && !defined(CORE_MOCK)
7788
#warning "MMU_IRAM_SIZE was undefined, setting to 0x8000UL!"
7889
#endif
79-
#define MMU_IRAM_SIZE 0x8000UL
90+
#define MMU_IRAM_SIZE 0x8000ul
8091
#endif
81-
#define IRAM_END (IRAM_START + MMU_IRAM_SIZE)
92+
const uintptr_t iram_end = iram_start + MMU_IRAM_SIZE;
8293

83-
return (IRAM_START <= (uintptr_t)addr && IRAM_END > (uintptr_t)addr);
94+
return (iram_start <= (uintptr_t)addr && iram_end > (uintptr_t)addr);
8495
}
8596

8697
static inline __attribute__((always_inline))
8798
bool mmu_is_dram(const void *addr) {
88-
#define DRAM_START 0x3FF80000UL
89-
#define DRAM_END 0x40000000UL
99+
const uintptr_t dram_start = 0x3FFE8000ul;
100+
// The Boot ROM starts where DRAM ends, at address 0x40000000ul.
101+
const uintptr_t dram_end = (uintptr_t)XCHAL_INSTRAM0_VADDR;
90102

91-
return (DRAM_START <= (uintptr_t)addr && DRAM_END > (uintptr_t)addr);
103+
return (dram_start <= (uintptr_t)addr && dram_end > (uintptr_t)addr);
92104
}
93105

94106
static inline __attribute__((always_inline))
95107
bool mmu_is_icache(const void *addr) {
96-
#define ICACHE_START 0x40200000UL
97-
#define ICACHE_END (ICACHE_START + 0x100000UL)
108+
extern void _irom0_text_end(void);
109+
const uintptr_t icache_start = (uintptr_t)XCHAL_INSTROM0_VADDR;
110+
const uintptr_t icache_end = (uintptr_t)_irom0_text_end;
98111

99-
return (ICACHE_START <= (uintptr_t)addr && ICACHE_END > (uintptr_t)addr);
112+
return (icache_start <= (uintptr_t)addr && icache_end > (uintptr_t)addr);
100113
}
101114

102115
#ifdef DEBUG_ESP_MMU
@@ -127,90 +140,131 @@ bool mmu_is_icache(const void *addr) {
127140
static inline __attribute__((always_inline))
128141
uint8_t mmu_get_uint8(const void *p8) {
129142
ASSERT_RANGE_TEST_READ(p8);
130-
uint32_t val = (*(uint32_t *)((uintptr_t)p8 & ~0x3));
131-
uint32_t pos = ((uintptr_t)p8 & 0x3) * 8;
143+
// https://gist.github.com/shafik/848ae25ee209f698763cffee272a58f8#how-do-we-type-pun-correctly
144+
// Comply with strict-aliasing rules. Using memcpy is a Standards suggested
145+
// method for type punning. The compiler optimizer will replace the memcpy
146+
// with an `l32i` instruction. Using __builtin_memcpy to ensure we get the
147+
// effects of the compiler optimization and not some #define version of
148+
// memcpy.
149+
void *v32 = (void *)((uintptr_t)p8 & ~(uintptr_t)3u);
150+
uint32_t val;
151+
__builtin_memcpy(&val, v32, sizeof(uint32_t));
152+
// Use an empty ASM to reference the 32-bit value. This will block the
153+
// compiler from immediately optimizing to an 8-bit or 16-bit load instruction
154+
// against IRAM memory. (This approach was inspired by
155+
// https://github.com/esp8266/Arduino/pull/7780#discussion_r548303374)
156+
// This issue was seen when using a constant address with the GCC 10.3
157+
// compiler.
158+
// As a general practice, I think referencing by way of Extended ASM R/W
159+
// output register will stop the compiler from reloading the value later
160+
// as 8-bit load from IRAM.
161+
asm volatile ("" :"+r"(val)); // inject 32-bit dependency
162+
uint32_t pos = ((uintptr_t)p8 & 3u) * 8u;
132163
val >>= pos;
133164
return (uint8_t)val;
134165
}
135166

136167
static inline __attribute__((always_inline))
137168
uint16_t mmu_get_uint16(const uint16_t *p16) {
138169
ASSERT_RANGE_TEST_READ(p16);
139-
uint32_t val = (*(uint32_t *)((uintptr_t)p16 & ~0x3));
140-
uint32_t pos = ((uintptr_t)p16 & 0x3) * 8;
170+
void *v32 = (void *)((uintptr_t)p16 & ~(uintptr_t)0x3u);
171+
uint32_t val;
172+
__builtin_memcpy(&val, v32, sizeof(uint32_t));
173+
asm volatile ("" :"+r"(val));
174+
uint32_t pos = ((uintptr_t)p16 & 3u) * 8u;
141175
val >>= pos;
142176
return (uint16_t)val;
143177
}
144178

145179
static inline __attribute__((always_inline))
146180
int16_t mmu_get_int16(const int16_t *p16) {
147181
ASSERT_RANGE_TEST_READ(p16);
148-
uint32_t val = (*(uint32_t *)((uintptr_t)p16 & ~0x3));
149-
uint32_t pos = ((uintptr_t)p16 & 0x3) * 8;
182+
void *v32 = (void *)((uintptr_t)p16 & ~(uintptr_t)3u);
183+
uint32_t val;
184+
__builtin_memcpy(&val, v32, sizeof(uint32_t));
185+
asm volatile ("" :"+r"(val));
186+
uint32_t pos = ((uintptr_t)p16 & 3u) * 8u;
150187
val >>= pos;
151188
return (int16_t)val;
152189
}
153190

154191
static inline __attribute__((always_inline))
155192
uint8_t mmu_set_uint8(void *p8, const uint8_t val) {
156193
ASSERT_RANGE_TEST_WRITE(p8);
157-
uint32_t pos = ((uintptr_t)p8 & 0x3) * 8;
194+
uint32_t pos = ((uintptr_t)p8 & 3u) * 8u;
158195
uint32_t sval = val << pos;
159-
uint32_t valmask = 0x0FF << pos;
196+
uint32_t valmask = 0x0FFu << pos;
197+
198+
void *v32 = (void *)((uintptr_t)p8 & ~(uintptr_t)3u);
199+
uint32_t ival;
200+
__builtin_memcpy(&ival, v32, sizeof(uint32_t));
201+
asm volatile ("" :"+r"(ival));
160202

161-
uint32_t *p32 = (uint32_t *)((uintptr_t)p8 & ~0x3);
162-
uint32_t ival = *p32;
163203
ival &= (~valmask);
164204
ival |= sval;
165-
*p32 = ival;
205+
/*
206+
This 32-bit dependency injection does not appear to be needed with the
207+
current GCC 10.3; however, that could change in future versions. Or, I
208+
may not have the right test for it to fail.
209+
*/
210+
asm volatile ("" :"+r"(ival));
211+
__builtin_memcpy(v32, &ival, sizeof(uint32_t));
166212
return val;
167213
}
168214

169215
static inline __attribute__((always_inline))
170216
uint16_t mmu_set_uint16(uint16_t *p16, const uint16_t val) {
171217
ASSERT_RANGE_TEST_WRITE(p16);
172-
uint32_t pos = ((uintptr_t)p16 & 0x3) * 8;
218+
uint32_t pos = ((uintptr_t)p16 & 3u) * 8u;
173219
uint32_t sval = val << pos;
174-
uint32_t valmask = 0x0FFFF << pos;
220+
uint32_t valmask = 0x0FFFFu << pos;
221+
222+
void *v32 = (void *)((uintptr_t)p16 & ~(uintptr_t)3u);
223+
uint32_t ival;
224+
__builtin_memcpy(&ival, v32, sizeof(uint32_t));
225+
asm volatile ("" :"+r"(ival));
175226

176-
uint32_t *p32 = (uint32_t *)((uintptr_t)p16 & ~0x3);
177-
uint32_t ival = *p32;
178227
ival &= (~valmask);
179228
ival |= sval;
180-
*p32 = ival;
229+
asm volatile ("" :"+r"(ival));
230+
__builtin_memcpy(v32, &ival, sizeof(uint32_t));
181231
return val;
182232
}
183233

184234
static inline __attribute__((always_inline))
185235
int16_t mmu_set_int16(int16_t *p16, const int16_t val) {
186236
ASSERT_RANGE_TEST_WRITE(p16);
187237
uint32_t sval = (uint16_t)val;
188-
uint32_t pos = ((uintptr_t)p16 & 0x3) * 8;
238+
uint32_t pos = ((uintptr_t)p16 & 3u) * 8u;
189239
sval <<= pos;
190-
uint32_t valmask = 0x0FFFF << pos;
240+
uint32_t valmask = 0x0FFFFu << pos;
241+
242+
void *v32 = (void *)((uintptr_t)p16 & ~(uintptr_t)3u);
243+
uint32_t ival;
244+
__builtin_memcpy(&ival, v32, sizeof(uint32_t));
245+
asm volatile ("" :"+r"(ival));
191246

192-
uint32_t *p32 = (uint32_t *)((uintptr_t)p16 & ~0x3);
193-
uint32_t ival = *p32;
194247
ival &= (~valmask);
195248
ival |= sval;
196-
*p32 = ival;
249+
asm volatile ("" :"+r"(ival));
250+
__builtin_memcpy(v32, &ival, sizeof(uint32_t));
197251
return val;
198252
}
199253

200254
#if (MMU_IRAM_SIZE > 32*1024) && !defined(MMU_SEC_HEAP)
201-
extern void _text_end(void);
202255
#define MMU_SEC_HEAP mmu_sec_heap()
203256
#define MMU_SEC_HEAP_SIZE mmu_sec_heap_size()
204257

205258
static inline __attribute__((always_inline))
206259
void *mmu_sec_heap(void) {
207-
uint32_t sec_heap = (uint32_t)_text_end + 32;
208-
return (void *)(sec_heap &= ~7);
260+
extern void _text_end(void);
261+
uintptr_t sec_heap = (uintptr_t)_text_end + (uintptr_t)32u;
262+
return (void *)(sec_heap &= ~(uintptr_t)7u);
209263
}
210264

211265
static inline __attribute__((always_inline))
212266
size_t mmu_sec_heap_size(void) {
213-
return (size_t)0xC000UL - ((size_t)mmu_sec_heap() - 0x40100000UL);
267+
return (size_t)0xC000ul - ((uintptr_t)mmu_sec_heap() - (uintptr_t)XCHAL_INSTRAM1_VADDR);
214268
}
215269
#endif
216270

libraries/esp8266/examples/IramReserve/IramReserve.ino

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,12 @@
1717
#include <umm_malloc/umm_malloc.h>
1818
#if defined(UMM_HEAP_IRAM)
1919

20+
#if defined(CORE_MOCK)
21+
#define XCHAL_INSTRAM1_VADDR 0x40100000
22+
#else
23+
#include <sys/config.h> // For config/core-isa.h
24+
#endif
25+
2026
// durable - as in long life, persisting across reboots.
2127
struct durable {
2228
uint32_t bootCounter;
@@ -30,7 +36,7 @@ struct durable {
3036
#define IRAM_RESERVE_SZ ((sizeof(struct durable) + 7UL) & ~7UL)
3137

3238
// Position its address just above the reduced 2nd Heap.
33-
#define IRAM_RESERVE (0x40100000UL + 0xC000UL - IRAM_RESERVE_SZ)
39+
#define IRAM_RESERVE ((uintptr_t)XCHAL_INSTRAM1_VADDR + 0xC000UL - IRAM_RESERVE_SZ)
3440

3541
// Define a reference with the right properties to make access easier.
3642
#define DURABLE ((struct durable *)IRAM_RESERVE)
@@ -100,9 +106,9 @@ extern "C" void umm_init_iram(void) {
100106
adjustments and checksums. These can affect the persistence of data across
101107
reboots.
102108
*/
103-
uint32_t sec_heap = (uint32_t)_text_end + 32;
109+
uintptr_t sec_heap = (uintptr_t)_text_end + 32;
104110
sec_heap &= ~7;
105-
size_t sec_heap_sz = 0xC000UL - (sec_heap - 0x40100000UL);
111+
size_t sec_heap_sz = 0xC000UL - (sec_heap - (uintptr_t)XCHAL_INSTRAM1_VADDR);
106112
sec_heap_sz -= IRAM_RESERVE_SZ; // Shrink IRAM heap
107113
if (0xC000UL > sec_heap_sz) {
108114

libraries/esp8266/examples/MMU48K/MMU48K.ino

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,12 @@
33
#include <umm_malloc/umm_malloc.h>
44
#include <umm_malloc/umm_heap_select.h>
55

6+
#if defined(CORE_MOCK)
7+
#define XCHAL_INSTRAM1_VADDR 0x40100000
8+
#else
9+
#include <sys/config.h> // For config/core-isa.h
10+
#endif
11+
612
uint32_t timed_byte_read(char *pc, uint32_t * o);
713
uint32_t timed_byte_read2(char *pc, uint32_t * o);
814
int divideA_B(int a, int b);
@@ -102,7 +108,7 @@ void print_mmu_status(Print& oStream) {
102108
#ifdef MMU_IRAM_SIZE
103109
oStream.printf_P(PSTR(" IRAM Size: %u"), MMU_IRAM_SIZE);
104110
oStream.println();
105-
const uint32_t iram_free = MMU_IRAM_SIZE - (uint32_t)((uintptr_t)_text_end - 0x40100000UL);
111+
const uint32_t iram_free = MMU_IRAM_SIZE - (uint32_t)((uintptr_t)_text_end - (uintptr_t)XCHAL_INSTRAM1_VADDR);
106112
oStream.printf_P(PSTR(" IRAM free: %u"), iram_free);
107113
oStream.println();
108114
#endif

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy