Skip to content

Commit 2634c1e

Browse files
committed
Attempt to fix JIT icache coherency on Arm64
The JIT compiler was experiencing intermittent failures on Arm64/Apple Silicon due to missing instruction cache invalidation after patching branch instructions. When update_branch_imm() modified branch targets in JIT-compiled code, the CPU's icache was not invalidated, so the core could keep executing stale cached instructions instead of the newly patched ones. This manifested as non-deterministic test failures, particularly in compute-intensive benchmarks such as the pi calculation test. The fix adds sys_icache_invalidate() after memcpy() in update_branch_imm() to ensure the icache is synchronized with the data cache after code modification. The commit also resets the jump count in code_cache_flush(), invalidates the patched offset in resolve_jumps(), and synchronizes the entire newly translated range (with dsb/isb barriers on Arm64) at the end of jit_translate(). This is critical on Arm64 cores, which have separate L1 instruction and data caches and do not guarantee coherency between them without explicit cache maintenance.
1 parent 4b61b26 commit 2634c1e

File tree

1 file changed

+27
-0
lines changed

1 file changed

+27
-0
lines changed

src/jit.c

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -611,6 +611,7 @@ static void update_branch_imm(struct jit_state *state,
611611
pthread_jit_write_protect_np(false);
612612
#endif
613613
memcpy(state->buf + offset, &insn, sizeof(uint32_t));
614+
sys_icache_invalidate(state->buf + offset, sizeof(uint32_t));
614615
#if defined(__APPLE__) && defined(__aarch64__)
615616
pthread_jit_write_protect_np(true);
616617
#endif
@@ -2167,6 +2168,7 @@ static void code_cache_flush(struct jit_state *state, riscv_t *rv)
21672168
should_flush = false;
21682169
state->offset = state->org_size;
21692170
state->n_blocks = 0;
2171+
state->n_jumps = 0; /* Reset jump count when flushing */
21702172
set_reset(&state->set);
21712173
clear_cache_hot(rv->block_cache, (clear_func_t) clear_hot);
21722174
#if RV32_HAS(T2C)
@@ -2229,6 +2231,7 @@ static void resolve_jumps(struct jit_state *state)
22292231

22302232
uint8_t *offset_ptr = &state->buf[jump.offset_loc];
22312233
memcpy(offset_ptr, &rel, sizeof(uint32_t));
2234+
sys_icache_invalidate(offset_ptr, sizeof(uint32_t));
22322235
#elif defined(__aarch64__)
22332236
int32_t rel = target_loc - jump.offset_loc;
22342237
update_branch_imm(state, jump.offset_loc, rel);
@@ -2318,12 +2321,36 @@ void jit_translate(riscv_t *rv, block_t *block)
23182321
memset(state->jumps, 0, MAX_JUMPS * sizeof(struct jump));
23192322
state->n_jumps = 0;
23202323
block->offset = state->offset;
2324+
uint32_t translation_start = state->offset;
23212325
translate_chained_block(state, rv, block);
23222326
if (unlikely(should_flush)) {
23232327
code_cache_flush(state, rv);
23242328
goto restart;
23252329
}
23262330
resolve_jumps(state);
2331+
/* Ensure all instruction cache is synchronized after translation */
2332+
if (state->offset > translation_start) {
2333+
#if defined(__APPLE__) && defined(__aarch64__)
2334+
/* Must be in write mode to invalidate cache on Apple ARM64 */
2335+
pthread_jit_write_protect_np(false);
2336+
#endif
2337+
#if defined(__aarch64__)
2338+
/* ARM64 needs data synchronization before cache invalidation */
2339+
__asm__ __volatile__("dsb sy" ::: "memory");
2340+
#endif
2341+
sys_icache_invalidate(state->buf + translation_start,
2342+
state->offset - translation_start);
2343+
#if defined(__aarch64__)
2344+
/* ARM64 needs instruction synchronization after cache invalidation */
2345+
__asm__ __volatile__("isb" ::: "memory");
2346+
#endif
2347+
#if defined(__APPLE__) && defined(__aarch64__)
2348+
/* Re-enable execution mode */
2349+
pthread_jit_write_protect_np(true);
2350+
#endif
2351+
}
2352+
/* Memory barrier to ensure all writes complete before marking hot */
2353+
__asm__ __volatile__("" ::: "memory");
23272354
block->hot = true;
23282355
}
23292356

0 commit comments

Comments
 (0)