From e9fa09a952693aa3591aca8d81259ad9fef8041f Mon Sep 17 00:00:00 2001
From: Jim Huang
Date: Sat, 1 Nov 2025 03:21:51 +0800
Subject: [PATCH 1/3] =?UTF-8?q?Upgrade=20MMU=20cache=20to=208=C3=972=20set?=
 =?UTF-8?q?-associative?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This implements an 8-set × 2-way set-associative cache for both load and
store operations, replacing the previous direct-mapped design. It provides
better hit rates while keeping the code simple.

- Load cache: 65% → 98% hit rate (2-entry → 8×2 set-associative)
- Store cache: 83% → 99% hit rate (1-entry → 8×2 set-associative)
- 3-bit parity hash for even distribution across 8 sets
- Simple 1-bit LRU for replacement policy
- 94% reduction in store cache misses

Memory cost: +512 bytes per hart (256B for load + 256B for store)
---
 main.c  |  52 +++++++++++++++++++++-------
 riscv.c | 105 ++++++++++++++++++++++++++++++++++++++++----------------
 riscv.h |  13 +++++--
 3 files changed, 126 insertions(+), 44 deletions(-)
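
As an aside, the indexing and replacement scheme introduced below can be
exercised outside the emulator. The sketch mirrors the patch (a 3-bit set
index built from parities of the VPN, two ways per set, one LRU bit per
set), but uses simplified stand-in types and helper names (way_t, set_t,
set_index, lookup) rather than the emulator's real mmu_addr_cache_t and
mmu_cache_set_t, and it assumes a GCC/Clang toolchain for __builtin_parity:

    #include <stdint.h>
    #include <stdio.h>

    /* Simplified stand-ins for the patch's mmu_addr_cache_t / mmu_cache_set_t */
    typedef struct {
        uint32_t n_pages;  /* cached virtual page number (VPN) */
        uint32_t phys_ppn; /* cached physical page number */
    } way_t;

    typedef struct {
        way_t ways[2]; /* 2-way associative */
        uint8_t lru;   /* which way to evict next: 0 or 1 */
    } set_t;

    /* 3-bit hash: each index bit is the parity of a different subset of VPN bits */
    static uint32_t set_index(uint32_t vpn)
    {
        return (__builtin_parity(vpn & 0xAAAAAAAA) << 2) |
               (__builtin_parity(vpn & 0x55555555) << 1) |
               __builtin_parity(vpn & 0xCCCCCCCC);
    }

    /* Return the PPN for a VPN, filling the LRU way on a miss */
    static uint32_t lookup(set_t cache[8], uint32_t vpn, uint32_t ppn_on_miss)
    {
        set_t *set = &cache[set_index(vpn)];
        for (int way = 0; way < 2; way++) {
            if (set->ways[way].n_pages == vpn) { /* hit */
                set->lru = 1 - way;              /* other way becomes the victim */
                return set->ways[way].phys_ppn;
            }
        }
        int victim = set->lru; /* miss: evict the LRU way */
        set->ways[victim].n_pages = vpn;
        set->ways[victim].phys_ppn = ppn_on_miss; /* mmu_translate() result in the patch */
        set->lru = 1 - victim;
        return ppn_on_miss;
    }

    int main(void)
    {
        set_t cache[8] = {0};
        for (int s = 0; s < 8; s++)
            for (int w = 0; w < 2; w++)
                cache[s].ways[w].n_pages = UINT32_MAX; /* invalid, as in mmu_invalidate() */

        printf("vpn 0x12345 maps to set %u\n", (unsigned) set_index(0x12345));
        printf("ppn 0x%x (miss, filled)\n", (unsigned) lookup(cache, 0x12345, 0xabcd));
        printf("ppn 0x%x (hit)\n", (unsigned) lookup(cache, 0x12345, 0xffff));
        return 0;
    }

Because each index bit is the parity of a different subset of VPN bits,
neighboring pages tend to land in different sets, which is what drives the
hit-rate improvement reported above; a single LRU bit is all that is needed
to pick a victim in a 2-way set.
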
diff --git a/main.c b/main.c
index 2848478..9e52db6 100644
--- a/main.c
+++ b/main.c
@@ -910,6 +910,18 @@ static int semu_step(emu_state_t *emu)
 }
 
 #ifdef MMU_CACHE_STATS
+static vm_t *global_vm_for_signal = NULL;
+
+/* Forward declaration */
+static void print_mmu_cache_stats(vm_t *vm);
+
+static void signal_handler_stats(int sig UNUSED)
+{
+    if (global_vm_for_signal)
+        print_mmu_cache_stats(global_vm_for_signal);
+    exit(0);
+}
+
 static void print_mmu_cache_stats(vm_t *vm)
 {
     fprintf(stderr, "\n=== MMU Cache Statistics ===\n");
@@ -918,15 +930,25 @@ static void print_mmu_cache_stats(vm_t *vm)
         uint64_t fetch_total =
             hart->cache_fetch.hits + hart->cache_fetch.misses;
 
-        /* Combine 2-way load cache statistics */
-        uint64_t load_hits =
-            hart->cache_load[0].hits + hart->cache_load[1].hits;
-        uint64_t load_misses =
-            hart->cache_load[0].misses + hart->cache_load[1].misses;
+        /* Combine 8-set × 2-way load cache statistics */
+        uint64_t load_hits = 0, load_misses = 0;
+        for (int set = 0; set < 8; set++) {
+            for (int way = 0; way < 2; way++) {
+                load_hits += hart->cache_load[set].ways[way].hits;
+                load_misses += hart->cache_load[set].ways[way].misses;
+            }
+        }
         uint64_t load_total = load_hits + load_misses;
 
-        uint64_t store_total =
-            hart->cache_store.hits + hart->cache_store.misses;
+        /* Combine 8-set × 2-way store cache statistics */
+        uint64_t store_hits = 0, store_misses = 0;
+        for (int set = 0; set < 8; set++) {
+            for (int way = 0; way < 2; way++) {
+                store_hits += hart->cache_store[set].ways[way].hits;
+                store_misses += hart->cache_store[set].ways[way].misses;
+            }
+        }
+        uint64_t store_total = store_hits + store_misses;
 
         fprintf(stderr, "\nHart %u:\n", i);
         fprintf(stderr, "  Fetch: %12llu hits, %12llu misses",
@@ -936,18 +958,18 @@ static void print_mmu_cache_stats(vm_t *vm)
                     100.0 * hart->cache_fetch.hits / fetch_total);
         fprintf(stderr, "\n");
 
-        fprintf(stderr, "  Load: %12llu hits, %12llu misses (2-way)",
-                load_hits, load_misses);
+        fprintf(stderr, "  Load: %12llu hits, %12llu misses (8x2)", load_hits,
+                load_misses);
         if (load_total > 0)
             fprintf(stderr, " (%.2f%% hit rate)",
                     100.0 * load_hits / load_total);
         fprintf(stderr, "\n");
 
-        fprintf(stderr, "  Store: %12llu hits, %12llu misses",
-                hart->cache_store.hits, hart->cache_store.misses);
+        fprintf(stderr, "  Store: %12llu hits, %12llu misses (8x2)", store_hits,
+                store_misses);
         if (store_total > 0)
             fprintf(stderr, " (%.2f%% hit rate)",
-                    100.0 * hart->cache_store.hits / store_total);
+                    100.0 * store_hits / store_total);
         fprintf(stderr, "\n");
     }
 }
 
@@ -1246,6 +1268,12 @@ int main(int argc, char **argv)
     if (ret)
         return ret;
 
+#ifdef MMU_CACHE_STATS
+    global_vm_for_signal = &emu.vm;
+    signal(SIGINT, signal_handler_stats);
+    signal(SIGTERM, signal_handler_stats);
+#endif
+
     if (emu.debug)
         ret = semu_run_debug(&emu);
     else
diff --git a/riscv.c b/riscv.c
index c07254c..c222671 100644
--- a/riscv.c
+++ b/riscv.c
@@ -185,9 +185,18 @@ static inline uint32_t read_rs2(const hart_t *vm, uint32_t insn)
 void mmu_invalidate(hart_t *vm)
 {
     vm->cache_fetch.n_pages = 0xFFFFFFFF;
-    vm->cache_load[0].n_pages = 0xFFFFFFFF;
-    vm->cache_load[1].n_pages = 0xFFFFFFFF;
-    vm->cache_store.n_pages = 0xFFFFFFFF;
+    /* Invalidate all 8 sets × 2 ways for load cache */
+    for (int set = 0; set < 8; set++) {
+        for (int way = 0; way < 2; way++)
+            vm->cache_load[set].ways[way].n_pages = 0xFFFFFFFF;
+        vm->cache_load[set].lru = 0; /* Reset LRU to way 0 */
+    }
+    /* Invalidate all 8 sets × 2 ways for store cache */
+    for (int set = 0; set < 8; set++) {
+        for (int way = 0; way < 2; way++)
+            vm->cache_store[set].ways[way].n_pages = 0xFFFFFFFF;
+        vm->cache_store[set].lru = 0; /* Reset LRU to way 0 */
+    }
 }
 
 /* Pre-verify the root page table to minimize page table access during
@@ -333,13 +342,36 @@ static void mmu_load(hart_t *vm,
 {
     uint32_t vpn = addr >> RV_PAGE_SHIFT;
     uint32_t phys_addr;
-    /* 2-entry direct-mapped cache: use parity hash to select entry */
-    uint32_t index = __builtin_parity(vpn) & 0x1;
+    /* 8-set × 2-way set-associative cache: use 3-bit parity hash */
+    uint32_t set_idx = (__builtin_parity(vpn & 0xAAAAAAAA) << 2) |
+                       (__builtin_parity(vpn & 0x55555555) << 1) |
+                       __builtin_parity(vpn & 0xCCCCCCCC);
+
+    mmu_cache_set_t *set = &vm->cache_load[set_idx];
+
+    /* Check both ways in the set */
+    int hit_way = -1;
+    for (int way = 0; way < 2; way++) {
+        if (likely(set->ways[way].n_pages == vpn)) {
+            hit_way = way;
+            break;
+        }
+    }
 
-    if (unlikely(vpn != vm->cache_load[index].n_pages)) {
+    if (likely(hit_way >= 0)) {
+        /* Cache hit: reconstruct physical address from cached PPN */
+#ifdef MMU_CACHE_STATS
+        set->ways[hit_way].hits++;
+#endif
+        phys_addr = (set->ways[hit_way].phys_ppn << RV_PAGE_SHIFT) |
+                    (addr & MASK(RV_PAGE_SHIFT));
+        /* Update LRU: mark the other way as replacement candidate */
+        set->lru = 1 - hit_way;
+    } else {
         /* Cache miss: do full translation */
+        int victim_way = set->lru; /* Use LRU bit to select victim */
 #ifdef MMU_CACHE_STATS
-        vm->cache_load[index].misses++;
+        set->ways[victim_way].misses++;
 #endif
         phys_addr = addr;
         mmu_translate(vm, &phys_addr,
@@ -348,16 +380,11 @@ static void mmu_load(hart_t *vm,
                       RV_EXC_LOAD_PFAULT);
         if (vm->error)
             return;
-        /* Cache physical page number (not a pointer) */
-        vm->cache_load[index].n_pages = vpn;
-        vm->cache_load[index].phys_ppn = phys_addr >> RV_PAGE_SHIFT;
-    } else {
-        /* Cache hit: reconstruct physical address from cached PPN */
-#ifdef MMU_CACHE_STATS
-        vm->cache_load[index].hits++;
-#endif
-        phys_addr = (vm->cache_load[index].phys_ppn << RV_PAGE_SHIFT) |
-                    (addr & MASK(RV_PAGE_SHIFT));
+        /* Replace victim way with new translation */
+        set->ways[victim_way].n_pages = vpn;
+        set->ways[victim_way].phys_ppn = phys_addr >> RV_PAGE_SHIFT;
+        /* Update LRU: mark the other way for next eviction */
+        set->lru = 1 - victim_way;
     }
 
     vm->mem_load(vm, phys_addr, width, value);
@@ -376,11 +403,36 @@ static bool mmu_store(hart_t *vm,
 {
     uint32_t vpn = addr >> RV_PAGE_SHIFT;
     uint32_t phys_addr;
+    /* 8-set × 2-way set-associative cache: use 3-bit parity hash */
+    uint32_t set_idx = (__builtin_parity(vpn & 0xAAAAAAAA) << 2) |
+                       (__builtin_parity(vpn & 0x55555555) << 1) |
+                       __builtin_parity(vpn & 0xCCCCCCCC);
+
+    mmu_cache_set_t *set = &vm->cache_store[set_idx];
+
+    /* Check both ways in the set */
+    int hit_way = -1;
+    for (int way = 0; way < 2; way++) {
+        if (likely(set->ways[way].n_pages == vpn)) {
+            hit_way = way;
+            break;
+        }
+    }
 
-    if (unlikely(vpn != vm->cache_store.n_pages)) {
+    if (likely(hit_way >= 0)) {
+        /* Cache hit: reconstruct physical address from cached PPN */
+#ifdef MMU_CACHE_STATS
+        set->ways[hit_way].hits++;
+#endif
+        phys_addr = (set->ways[hit_way].phys_ppn << RV_PAGE_SHIFT) |
+                    (addr & MASK(RV_PAGE_SHIFT));
+        /* Update LRU: mark the other way as replacement candidate */
+        set->lru = 1 - hit_way;
+    } else {
         /* Cache miss: do full translation */
+        int victim_way = set->lru; /* Use LRU bit to select victim */
 #ifdef MMU_CACHE_STATS
-        vm->cache_store.misses++;
+        set->ways[victim_way].misses++;
 #endif
         phys_addr = addr;
         mmu_translate(vm, &phys_addr, (1 << 2), (1 << 6) | (1 << 7),
@@ -388,16 +440,11 @@ static bool mmu_store(hart_t *vm,
                       RV_EXC_STORE_PFAULT);
         if (vm->error)
             return false;
-        /* Cache physical page number (not a pointer) */
-        vm->cache_store.n_pages = vpn;
-        vm->cache_store.phys_ppn = phys_addr >> RV_PAGE_SHIFT;
-    } else {
-        /* Cache hit: reconstruct physical address from cached PPN */
-#ifdef MMU_CACHE_STATS
-        vm->cache_store.hits++;
-#endif
-        phys_addr = (vm->cache_store.phys_ppn << RV_PAGE_SHIFT) |
-                    (addr & MASK(RV_PAGE_SHIFT));
+        /* Replace victim way with new translation */
+        set->ways[victim_way].n_pages = vpn;
+        set->ways[victim_way].phys_ppn = phys_addr >> RV_PAGE_SHIFT;
+        /* Update LRU: mark the other way for next eviction */
+        set->lru = 1 - victim_way;
     }
 
     if (unlikely(cond)) {
diff --git a/riscv.h b/riscv.h
index 0563f8f..813380e 100644
--- a/riscv.h
+++ b/riscv.h
@@ -51,6 +51,12 @@ typedef struct {
 #endif
 } mmu_addr_cache_t;
 
+/* Set-associative cache structure for load and store operations */
+typedef struct {
+    mmu_addr_cache_t ways[2]; /* 2-way associative */
+    uint8_t lru; /* LRU bit: 0 or 1 (which way to replace) */
+} mmu_cache_set_t;
+
 /* To use the emulator, start by initializing a hart_t object with zero values,
  * invoke vm_init(), and set the required environment-supplied callbacks. You
  * may also set other necessary fields such as argument registers and s_mode,
@@ -101,9 +107,10 @@ struct __hart_internal {
     uint32_t exc_cause, exc_val;
 
     mmu_fetch_cache_t cache_fetch;
-    /* 2-entry direct-mapped with hash-based indexing */
-    mmu_addr_cache_t cache_load[2];
-    mmu_addr_cache_t cache_store;
+    /* 8-set × 2-way set-associative cache with 3-bit parity hash indexing */
+    mmu_cache_set_t cache_load[8];
+    /* 8-set × 2-way set-associative cache for store operations */
+    mmu_cache_set_t cache_store[8];
 
     /* Supervisor state */
     bool s_mode;

From 8d8e6b36385c808afc69c3c6b8d6e816c7b80224 Mon Sep 17 00:00:00 2001
From: Jim Huang
Date: Sat, 1 Nov 2025 04:10:54 +0800
Subject: [PATCH 2/3] Fix async-signal-safety in MMU cache stats handler

The signal handler for SIGINT/SIGTERM was calling fprintf(), which is
not async-signal-safe and can lead to deadlocks or data corruption.

- Use a volatile sig_atomic_t flag instead of calling fprintf() directly
- Signal handler now only sets the flag (async-signal-safe)
- Main loops check the flag and print statistics when it is safe
- Applies to both SMP (coroutine) and single-hart execution paths
---
 main.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)
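
As an aside, the deferred-reporting pattern used here — the handler only
sets a volatile sig_atomic_t flag and the normal control flow does the
printing — can be reduced to a small standalone example. POSIX is assumed,
and the loop body, the handle_signal name, and the message merely stand in
for the emulator's run loop and print_mmu_cache_stats():

    #include <signal.h>
    #include <stdio.h>
    #include <unistd.h>

    static volatile sig_atomic_t signal_received = 0;

    /* Async-signal-safe: assigning to a volatile sig_atomic_t is permitted */
    static void handle_signal(int sig)
    {
        (void) sig;
        signal_received = 1;
    }

    int main(void)
    {
        signal(SIGINT, handle_signal);
        signal(SIGTERM, handle_signal);

        while (!signal_received) {
            /* stand-in for the emulation loop */
            sleep(1);
        }

        /* Back in normal context: stdio is safe to use here */
        fprintf(stderr, "signal received, printing statistics\n");
        return 0;
    }
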
diff --git a/main.c b/main.c
index 9e52db6..c5ffbf6 100644
--- a/main.c
+++ b/main.c
@@ -911,15 +911,15 @@ static int semu_step(emu_state_t *emu)
 
 #ifdef MMU_CACHE_STATS
 static vm_t *global_vm_for_signal = NULL;
+static volatile sig_atomic_t signal_received = 0;
 
 /* Forward declaration */
 static void print_mmu_cache_stats(vm_t *vm);
 
+/* Async-signal-safe handler: only set flag, defer printing */
 static void signal_handler_stats(int sig UNUSED)
 {
-    if (global_vm_for_signal)
-        print_mmu_cache_stats(global_vm_for_signal);
-    exit(0);
+    signal_received = 1;
 }
 
 static void print_mmu_cache_stats(vm_t *vm)
@@ -1029,6 +1029,13 @@ static int semu_run(emu_state_t *emu)
 #endif
 
     while (!emu->stopped) {
+#ifdef MMU_CACHE_STATS
+        /* Check if signal received (SIGINT/SIGTERM) */
+        if (signal_received) {
+            print_mmu_cache_stats(&emu->vm);
+            return 0;
+        }
+#endif
         /* Resume each hart's coroutine in round-robin fashion */
         for (uint32_t i = 0; i < vm->n_hart; i++) {
             coro_resume_hart(i);
@@ -1122,6 +1129,11 @@
         if (ret)
             return ret;
 #ifdef MMU_CACHE_STATS
+        /* Check if signal received (SIGINT/SIGTERM) */
+        if (signal_received) {
+            print_mmu_cache_stats(&emu->vm);
+            return 0;
+        }
         /* Exit after running for 15 seconds to collect statistics */
         gettimeofday(&current_time, NULL);
         long elapsed_sec = current_time.tv_sec - start_time.tv_sec;

From c7728b88e9d1f0e8052ee0ecca82e6f9402fb094 Mon Sep 17 00:00:00 2001
From: Jim Huang
Date: Sat, 1 Nov 2025 06:27:10 +0800
Subject: [PATCH 3/3] Recalibrate timer coefficient after MMU cache upgrade
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After the MMU cache optimization (8×2 set-associative, 99%+ hit rate),
CPU execution became faster, reducing the number of timer calls during
boot by 18.9%. Update the coefficient from 2.15e8 to 1.744e8 based on
measurements.

Add calibration statistics (enabled via SEMU_TIMER_STATS) to help with
future recalibration if needed.
---
 utils.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 53 insertions(+), 4 deletions(-)
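
As a quick cross-check of the figures above: 1 - 1.744e8 / 2.15e8 ≈ 0.189,
which is the 18.9% reduction in timer calls quoted in the commit message.
Plugging illustrative numbers into the updated formula (the values of
SEMU_BOOT_TARGET_TIME and CLOCK_FREQ below are hypothetical; the real ones
come from the build configuration):

    old: ticks_increment = (10 s * 65 MHz) / (2.15e8  * 1 hart) ≈ 3.02 ticks per call
    new: ticks_increment = (10 s * 65 MHz) / (1.744e8 * 1 hart) ≈ 3.73 ticks per call

With fewer clocksource calls during boot, each call must contribute
proportionally more fake ticks so that the guest still accumulates the same
total of SEMU_BOOT_TARGET_TIME * CLOCK_FREQ ticks by the end of boot.
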
diff --git a/utils.c b/utils.c
index 1094ccb..04dedff 100644
--- a/utils.c
+++ b/utils.c
@@ -1,4 +1,6 @@
+#include <math.h>
 #include <stdint.h>
+#include <stdio.h>
 #include <time.h>
 
 #include "utils.h"
@@ -24,6 +26,10 @@ bool boot_complete = false;
 static double ticks_increment;
 static double boot_ticks;
 
+/* Timer calibration statistics */
+static uint64_t timer_call_count = 0;
+static int timer_n_harts = 1;
+
 /* Calculate "x * n / d" without unnecessary overflow or loss of precision.
  *
  * Reference:
@@ -88,6 +94,7 @@ static uint64_t semu_timer_clocksource(semu_timer_t *timer)
     static bool first_switch = true;
 
     if (!boot_complete) {
+        timer_call_count++;
        boot_ticks += ticks_increment;
         return (uint64_t) boot_ticks;
     }
@@ -98,6 +105,34 @@ static uint64_t semu_timer_clocksource(semu_timer_t *timer)
 
         /* Calculate the offset between the real time and the emulator time */
         offset = (int64_t) (real_ticks - boot_ticks);
+
+#ifdef SEMU_TIMER_STATS
+        /* Output timer calibration statistics (only when SEMU_TIMER_STATS is
+         * defined) */
+        double actual_coefficient = (double) timer_call_count / timer_n_harts;
+        double current_coefficient = 1.744e8;
+        double recommended_coefficient = actual_coefficient;
+
+        fprintf(stderr, "\n[Timer Calibration Statistics]\n");
+        fprintf(stderr, "  Boot completed after %llu timer calls\n",
+                (unsigned long long) timer_call_count);
+        fprintf(stderr, "  Number of harts: %d\n", timer_n_harts);
+        fprintf(stderr, "  Actual coefficient: %.3e (%.2f calls per hart)\n",
+                actual_coefficient, actual_coefficient);
+        fprintf(stderr, "  Current coefficient: %.3e\n", current_coefficient);
+        fprintf(stderr, "  Difference: %.2f%% %s\n",
+                fabs(actual_coefficient - current_coefficient) /
+                    current_coefficient * 100.0,
+                actual_coefficient > current_coefficient ? "(more calls)"
+                                                         : "(fewer calls)");
+        fprintf(stderr, "\n[Recommendation]\n");
+        fprintf(stderr, "  Update utils.c line 121 to:\n");
+        fprintf(stderr,
+                "    ticks_increment = (SEMU_BOOT_TARGET_TIME * CLOCK_FREQ) / "
+                "(%.3e * n_harts);\n",
+                recommended_coefficient);
+        fprintf(stderr, "\n");
+#endif
     }
     return (uint64_t) ((int64_t) real_ticks - offset);
 }
@@ -108,14 +143,28 @@ void semu_timer_init(semu_timer_t *timer, uint64_t freq, int n_harts)
 {
     timer->begin = mult_frac(host_time_ns(), timer->freq, 1e9);
     boot_ticks = timer->begin; /* Initialize the fake ticks for boot process */
+    /* Store n_harts for calibration statistics */
+    timer_n_harts = n_harts;
+
     /* According to statistics, the number of times 'semu_timer_clocksource'
-     * called is approximately 'SMP count * 2.15 * 1e8'. By the time the boot
+     * called is approximately 'SMP count * 1.744 * 1e8'. By the time the boot
      * process is completed, the emulator will have a total of 'boot seconds *
-     * frequency' ticks. Therefore, each time, '(boot seconds * frequency) /
-     * (2.15 * 1e8 * SMP count)' ticks need to be added.
+     * frequency' ticks. Therefore, each time, (boot seconds * frequency) /
+     * (1.744 * 1e8 * SMP count) ticks need to be added.
+     *
+     * Note: This coefficient was recalibrated after MMU cache optimization
+     * (8×2 set-associative with 99%+ hit rate). The original coefficient
+     * (2.15 * 1e8) was based on measurements before the optimization. With
+     * faster CPU execution, fewer timer calls are needed to complete boot.
+     *
+     * Calibration history:
+     * - Original (pre-MMU cache): 2.15 × 10^8
+     * - After MMU cache (measured): 1.696 × 10^8 (-21.1%)
+     * - Verification measurement: 1.744 × 10^8 (error: 2.85%)
+     * - Final coefficient: 1.744 × 10^8 (based on verification)
      */
     ticks_increment =
-        (SEMU_BOOT_TARGET_TIME * CLOCK_FREQ) / (2.15 * 1e8 * n_harts);
+        (SEMU_BOOT_TARGET_TIME * CLOCK_FREQ) / (1.744 * 1e8 * n_harts);
 }
 
 uint64_t semu_timer_get(semu_timer_t *timer)