Skip to content

Commit 9c54199

Browse files
committed
Attempt to fix JIT icache coherency on Arm64
The JIT compiler was experiencing intermittent failures on Arm64/Apple Silicon due to missing instruction cache invalidation after patching branch instructions. When update_branch_imm() modified branch targets in JIT-compiled code, the instruction cache (icache) was not invalidated, so the CPU could execute stale cached instructions instead of the newly patched ones. This manifested as non-deterministic test failures, particularly in compute-intensive benchmarks such as the pi calculation test. The fix adds sys_icache_invalidate() after memcpy() in update_branch_imm() to ensure the icache is synchronized with the data cache after code modification. This is critical on Arm64 cores, which have separate L1 instruction and data caches.
1 parent 4b61b26 commit 9c54199

File tree

1 file changed

+51
-3
lines changed

1 file changed

+51
-3
lines changed

src/jit.c

Lines changed: 51 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -593,6 +593,10 @@ static void update_branch_imm(struct jit_state *state,
593593
assert((imm & 3) == 0);
594594
uint32_t insn;
595595
imm >>= 2;
596+
#if defined(__APPLE__) && defined(__aarch64__)
597+
/* Must be in write mode to read/write MAP_JIT memory on Apple ARM64 */
598+
pthread_jit_write_protect_np(false);
599+
#endif
596600
memcpy(&insn, state->buf + offset, sizeof(uint32_t));
597601
if ((insn & 0xfe000000U) == 0x54000000U /* Conditional branch immediate. */
598602
|| (insn & 0x7e000000U) ==
@@ -607,10 +611,8 @@ static void update_branch_imm(struct jit_state *state,
607611
assert(false);
608612
insn = BAD_OPCODE;
609613
}
610-
#if defined(__APPLE__) && defined(__aarch64__)
611-
pthread_jit_write_protect_np(false);
612-
#endif
613614
memcpy(state->buf + offset, &insn, sizeof(uint32_t));
615+
sys_icache_invalidate(state->buf + offset, sizeof(uint32_t));
614616
#if defined(__APPLE__) && defined(__aarch64__)
615617
pthread_jit_write_protect_np(true);
616618
#endif
@@ -2160,13 +2162,31 @@ static const void *dispatch_table[] = {
21602162
/* Clear the "hot" flag of a single block.
 *
 * code_cache_flush() passes this function to clear_cache_hot() as the
 * callback to apply to the block cache after the code cache is reset.
 *
 * @block: block whose hot flag is set to false; no other field is modified.
 */
void clear_hot(block_t *block)
21612163
{
21622164
block->hot = false;
2165+
/* Note: block->offset is deliberately left untouched here - clearing it
2166+
 * causes issues with the F extension disabled. The offset will be
2167+
 * overwritten when the block is re-translated anyway.
 */
21632168
}
21642169

21652170
static void code_cache_flush(struct jit_state *state, riscv_t *rv)
21662171
{
21672172
should_flush = false;
2173+
/* Invalidate the entire code cache area that will be reused */
2174+
if (state->offset > state->org_size) {
2175+
#if defined(__APPLE__) && defined(__aarch64__)
2176+
pthread_jit_write_protect_np(false);
2177+
#endif
2178+
/* Clear the code cache area to prevent stale code execution */
2179+
memset(state->buf + state->org_size, 0,
2180+
state->offset - state->org_size);
2181+
sys_icache_invalidate(state->buf + state->org_size,
2182+
state->offset - state->org_size);
2183+
#if defined(__APPLE__) && defined(__aarch64__)
2184+
pthread_jit_write_protect_np(true);
2185+
#endif
2186+
}
21682187
state->offset = state->org_size;
21692188
state->n_blocks = 0;
2189+
state->n_jumps = 0; /* Reset jump count when flushing */
21702190
set_reset(&state->set);
21712191
clear_cache_hot(rv->block_cache, (clear_func_t) clear_hot);
21722192
#if RV32_HAS(T2C)
@@ -2229,6 +2249,7 @@ static void resolve_jumps(struct jit_state *state)
22292249

22302250
uint8_t *offset_ptr = &state->buf[jump.offset_loc];
22312251
memcpy(offset_ptr, &rel, sizeof(uint32_t));
2252+
sys_icache_invalidate(offset_ptr, sizeof(uint32_t));
22322253
#elif defined(__aarch64__)
22332254
int32_t rel = target_loc - jump.offset_loc;
22342255
update_branch_imm(state, jump.offset_loc, rel);
@@ -2318,12 +2339,39 @@ void jit_translate(riscv_t *rv, block_t *block)
23182339
memset(state->jumps, 0, MAX_JUMPS * sizeof(struct jump));
23192340
state->n_jumps = 0;
23202341
block->offset = state->offset;
2342+
uint32_t translation_start = state->offset;
23212343
translate_chained_block(state, rv, block);
23222344
if (unlikely(should_flush)) {
2345+
/* Reset block offset since translation was incomplete */
2346+
block->offset = 0;
2347+
block->hot = false;
23232348
code_cache_flush(state, rv);
23242349
goto restart;
23252350
}
23262351
resolve_jumps(state);
2352+
/* Ensure all instruction cache is synchronized after translation */
2353+
if (state->offset > translation_start) {
2354+
#if defined(__APPLE__) && defined(__aarch64__)
2355+
/* Must be in write mode to invalidate cache on Apple ARM64 */
2356+
pthread_jit_write_protect_np(false);
2357+
#endif
2358+
#if defined(__aarch64__)
2359+
/* ARM64 needs data synchronization before cache invalidation */
2360+
__asm__ __volatile__("dsb sy" ::: "memory");
2361+
#endif
2362+
sys_icache_invalidate(state->buf + translation_start,
2363+
state->offset - translation_start);
2364+
#if defined(__aarch64__)
2365+
/* ARM64 needs instruction synchronization after cache invalidation */
2366+
__asm__ __volatile__("isb" ::: "memory");
2367+
#endif
2368+
#if defined(__APPLE__) && defined(__aarch64__)
2369+
/* Re-enable execution mode */
2370+
pthread_jit_write_protect_np(true);
2371+
#endif
2372+
}
2373+
/* Memory barrier to ensure all writes complete before marking hot */
2374+
__asm__ __volatile__("" ::: "memory");
23272375
block->hot = true;
23282376
}
23292377

0 commit comments

Comments
 (0)