Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 52 additions & 12 deletions main.c
Original file line number Diff line number Diff line change
Expand Up @@ -910,6 +910,18 @@ static int semu_step(emu_state_t *emu)
}

#ifdef MMU_CACHE_STATS
/* VM pointer recorded for the statistics signal path; main() stores &emu.vm
 * here before installing the SIGINT/SIGTERM handlers.
 * NOTE(review): no read of this variable is visible in this view - confirm it
 * is still needed now that the handler only sets a flag. */
static vm_t *global_vm_for_signal = NULL;
/* Set to 1 by signal_handler_stats(). The run loop polls it and prints the
 * MMU cache statistics from normal (non-signal) context. */
static volatile sig_atomic_t signal_received = 0;

/* Forward declaration; the definition follows the signal handler. */
static void print_mmu_cache_stats(vm_t *vm);

/* Async-signal-safe handler: only set flag, defer printing */
static void signal_handler_stats(int sig UNUSED)
{
signal_received = 1;
}

static void print_mmu_cache_stats(vm_t *vm)
{
fprintf(stderr, "\n=== MMU Cache Statistics ===\n");
Expand All @@ -918,15 +930,25 @@ static void print_mmu_cache_stats(vm_t *vm)
uint64_t fetch_total =
hart->cache_fetch.hits + hart->cache_fetch.misses;

/* Combine 2-way load cache statistics */
uint64_t load_hits =
hart->cache_load[0].hits + hart->cache_load[1].hits;
uint64_t load_misses =
hart->cache_load[0].misses + hart->cache_load[1].misses;
/* Combine 8-set × 2-way load cache statistics */
uint64_t load_hits = 0, load_misses = 0;
for (int set = 0; set < 8; set++) {
for (int way = 0; way < 2; way++) {
load_hits += hart->cache_load[set].ways[way].hits;
load_misses += hart->cache_load[set].ways[way].misses;
}
}
uint64_t load_total = load_hits + load_misses;

uint64_t store_total =
hart->cache_store.hits + hart->cache_store.misses;
/* Combine 8-set × 2-way store cache statistics */
uint64_t store_hits = 0, store_misses = 0;
for (int set = 0; set < 8; set++) {
for (int way = 0; way < 2; way++) {
store_hits += hart->cache_store[set].ways[way].hits;
store_misses += hart->cache_store[set].ways[way].misses;
}
}
uint64_t store_total = store_hits + store_misses;

fprintf(stderr, "\nHart %u:\n", i);
fprintf(stderr, " Fetch: %12llu hits, %12llu misses",
Expand All @@ -936,18 +958,18 @@ static void print_mmu_cache_stats(vm_t *vm)
100.0 * hart->cache_fetch.hits / fetch_total);
fprintf(stderr, "\n");

fprintf(stderr, " Load: %12llu hits, %12llu misses (2-way)",
load_hits, load_misses);
fprintf(stderr, " Load: %12llu hits, %12llu misses (8x2)", load_hits,
load_misses);
if (load_total > 0)
fprintf(stderr, " (%.2f%% hit rate)",
100.0 * load_hits / load_total);
fprintf(stderr, "\n");

fprintf(stderr, " Store: %12llu hits, %12llu misses",
hart->cache_store.hits, hart->cache_store.misses);
fprintf(stderr, " Store: %12llu hits, %12llu misses (8x2)", store_hits,
store_misses);
if (store_total > 0)
fprintf(stderr, " (%.2f%% hit rate)",
100.0 * hart->cache_store.hits / store_total);
100.0 * store_hits / store_total);
fprintf(stderr, "\n");
}
}
Expand Down Expand Up @@ -1007,6 +1029,13 @@ static int semu_run(emu_state_t *emu)
#endif

while (!emu->stopped) {
#ifdef MMU_CACHE_STATS
/* Check if signal received (SIGINT/SIGTERM) */
if (signal_received) {
print_mmu_cache_stats(&emu->vm);
return 0;
}
#endif
/* Resume each hart's coroutine in round-robin fashion */
for (uint32_t i = 0; i < vm->n_hart; i++) {
coro_resume_hart(i);
Expand Down Expand Up @@ -1100,6 +1129,11 @@ static int semu_run(emu_state_t *emu)
if (ret)
return ret;
#ifdef MMU_CACHE_STATS
/* Check if signal received (SIGINT/SIGTERM) */
if (signal_received) {
print_mmu_cache_stats(&emu->vm);
return 0;
}
/* Exit after running for 15 seconds to collect statistics */
gettimeofday(&current_time, NULL);
long elapsed_sec = current_time.tv_sec - start_time.tv_sec;
Expand Down Expand Up @@ -1246,6 +1280,12 @@ int main(int argc, char **argv)
if (ret)
return ret;

#ifdef MMU_CACHE_STATS
global_vm_for_signal = &emu.vm;
signal(SIGINT, signal_handler_stats);
signal(SIGTERM, signal_handler_stats);
#endif

if (emu.debug)
ret = semu_run_debug(&emu);
else
Expand Down
105 changes: 76 additions & 29 deletions riscv.c
Original file line number Diff line number Diff line change
Expand Up @@ -185,9 +185,18 @@ static inline uint32_t read_rs2(const hart_t *vm, uint32_t insn)
/* Invalidate every cached address translation held by a hart.
 *
 * An entry is invalidated by setting its cached virtual page number
 * (n_pages) to 0xFFFFFFFF. Lookups compare n_pages against
 * 'addr >> RV_PAGE_SHIFT', which cannot equal 0xFFFFFFFF for a non-zero
 * shift, so an invalidated entry always misses.
 *
 * The load and store caches are arrays of 8 sets with 2 ways each; the old
 * flat accesses (cache_load[0].n_pages, cache_store.n_pages) no longer match
 * that layout and are replaced by the per-set loops below.
 *
 * @vm: hart whose fetch, load and store translation caches are flushed.
 */
void mmu_invalidate(hart_t *vm)
{
    vm->cache_fetch.n_pages = 0xFFFFFFFF;

    /* Invalidate all 8 sets x 2 ways of the load cache */
    for (int set = 0; set < 8; set++) {
        for (int way = 0; way < 2; way++)
            vm->cache_load[set].ways[way].n_pages = 0xFFFFFFFF;
        vm->cache_load[set].lru = 0; /* Next victim in this set is way 0 */
    }

    /* Invalidate all 8 sets x 2 ways of the store cache */
    for (int set = 0; set < 8; set++) {
        for (int way = 0; way < 2; way++)
            vm->cache_store[set].ways[way].n_pages = 0xFFFFFFFF;
        vm->cache_store[set].lru = 0; /* Next victim in this set is way 0 */
    }
}

/* Pre-verify the root page table to minimize page table access during
Expand Down Expand Up @@ -333,13 +342,36 @@ static void mmu_load(hart_t *vm,
{
uint32_t vpn = addr >> RV_PAGE_SHIFT;
uint32_t phys_addr;
/* 2-entry direct-mapped cache: use parity hash to select entry */
uint32_t index = __builtin_parity(vpn) & 0x1;
/* 8-set × 2-way set-associative cache: use 3-bit parity hash */
uint32_t set_idx = (__builtin_parity(vpn & 0xAAAAAAAA) << 2) |
(__builtin_parity(vpn & 0x55555555) << 1) |
__builtin_parity(vpn & 0xCCCCCCCC);

mmu_cache_set_t *set = &vm->cache_load[set_idx];

/* Check both ways in the set */
int hit_way = -1;
for (int way = 0; way < 2; way++) {
if (likely(set->ways[way].n_pages == vpn)) {
hit_way = way;
break;
}
}

if (unlikely(vpn != vm->cache_load[index].n_pages)) {
if (likely(hit_way >= 0)) {
/* Cache hit: reconstruct physical address from cached PPN */
#ifdef MMU_CACHE_STATS
set->ways[hit_way].hits++;
#endif
phys_addr = (set->ways[hit_way].phys_ppn << RV_PAGE_SHIFT) |
(addr & MASK(RV_PAGE_SHIFT));
/* Update LRU: mark the other way as replacement candidate */
set->lru = 1 - hit_way;
} else {
/* Cache miss: do full translation */
int victim_way = set->lru; /* Use LRU bit to select victim */
#ifdef MMU_CACHE_STATS
vm->cache_load[index].misses++;
set->ways[victim_way].misses++;
#endif
phys_addr = addr;
mmu_translate(vm, &phys_addr,
Expand All @@ -348,16 +380,11 @@ static void mmu_load(hart_t *vm,
RV_EXC_LOAD_PFAULT);
if (vm->error)
return;
/* Cache physical page number (not a pointer) */
vm->cache_load[index].n_pages = vpn;
vm->cache_load[index].phys_ppn = phys_addr >> RV_PAGE_SHIFT;
} else {
/* Cache hit: reconstruct physical address from cached PPN */
#ifdef MMU_CACHE_STATS
vm->cache_load[index].hits++;
#endif
phys_addr = (vm->cache_load[index].phys_ppn << RV_PAGE_SHIFT) |
(addr & MASK(RV_PAGE_SHIFT));
/* Replace victim way with new translation */
set->ways[victim_way].n_pages = vpn;
set->ways[victim_way].phys_ppn = phys_addr >> RV_PAGE_SHIFT;
/* Update LRU: mark the other way for next eviction */
set->lru = 1 - victim_way;
}

vm->mem_load(vm, phys_addr, width, value);
Expand All @@ -376,28 +403,48 @@ static bool mmu_store(hart_t *vm,
{
uint32_t vpn = addr >> RV_PAGE_SHIFT;
uint32_t phys_addr;
/* 8-set × 2-way set-associative cache: use 3-bit parity hash */
uint32_t set_idx = (__builtin_parity(vpn & 0xAAAAAAAA) << 2) |
(__builtin_parity(vpn & 0x55555555) << 1) |
__builtin_parity(vpn & 0xCCCCCCCC);

mmu_cache_set_t *set = &vm->cache_store[set_idx];

/* Check both ways in the set */
int hit_way = -1;
for (int way = 0; way < 2; way++) {
if (likely(set->ways[way].n_pages == vpn)) {
hit_way = way;
break;
}
}

if (unlikely(vpn != vm->cache_store.n_pages)) {
if (likely(hit_way >= 0)) {
/* Cache hit: reconstruct physical address from cached PPN */
#ifdef MMU_CACHE_STATS
set->ways[hit_way].hits++;
#endif
phys_addr = (set->ways[hit_way].phys_ppn << RV_PAGE_SHIFT) |
(addr & MASK(RV_PAGE_SHIFT));
/* Update LRU: mark the other way as replacement candidate */
set->lru = 1 - hit_way;
} else {
/* Cache miss: do full translation */
int victim_way = set->lru; /* Use LRU bit to select victim */
#ifdef MMU_CACHE_STATS
vm->cache_store.misses++;
set->ways[victim_way].misses++;
#endif
phys_addr = addr;
mmu_translate(vm, &phys_addr, (1 << 2), (1 << 6) | (1 << 7),
vm->sstatus_sum && vm->s_mode, RV_EXC_STORE_FAULT,
RV_EXC_STORE_PFAULT);
if (vm->error)
return false;
/* Cache physical page number (not a pointer) */
vm->cache_store.n_pages = vpn;
vm->cache_store.phys_ppn = phys_addr >> RV_PAGE_SHIFT;
} else {
/* Cache hit: reconstruct physical address from cached PPN */
#ifdef MMU_CACHE_STATS
vm->cache_store.hits++;
#endif
phys_addr = (vm->cache_store.phys_ppn << RV_PAGE_SHIFT) |
(addr & MASK(RV_PAGE_SHIFT));
/* Replace victim way with new translation */
set->ways[victim_way].n_pages = vpn;
set->ways[victim_way].phys_ppn = phys_addr >> RV_PAGE_SHIFT;
/* Update LRU: mark the other way for next eviction */
set->lru = 1 - victim_way;
}

if (unlikely(cond)) {
Expand Down
13 changes: 10 additions & 3 deletions riscv.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,12 @@ typedef struct {
#endif
} mmu_addr_cache_t;

/* One set of a 2-way set-associative address-translation cache.
 * The hart keeps an array of these for loads (cache_load) and another for
 * stores (cache_store); a hash of the virtual page number picks the set. */
typedef struct {
mmu_addr_cache_t ways[2]; /* The two ways (entries) searched on a lookup */
uint8_t lru; /* Index (0 or 1) of the way to evict on the next miss */
} mmu_cache_set_t;

/* To use the emulator, start by initializing a hart_t object with zero values,
* invoke vm_init(), and set the required environment-supplied callbacks. You
* may also set other necessary fields such as argument registers and s_mode,
Expand Down Expand Up @@ -101,9 +107,10 @@ struct __hart_internal {
uint32_t exc_cause, exc_val;

mmu_fetch_cache_t cache_fetch;
/* 2-entry direct-mapped with hash-based indexing */
mmu_addr_cache_t cache_load[2];
mmu_addr_cache_t cache_store;
/* 8-set × 2-way set-associative cache with 3-bit parity hash indexing */
mmu_cache_set_t cache_load[8];
/* 8-set × 2-way set-associative cache for store operations */
mmu_cache_set_t cache_store[8];

/* Supervisor state */
bool s_mode;
Expand Down
57 changes: 53 additions & 4 deletions utils.c
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
#include <math.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

#include "utils.h"
Expand All @@ -24,6 +26,10 @@ bool boot_complete = false;
static double ticks_increment;
static double boot_ticks;

/* Timer calibration statistics */
static uint64_t timer_call_count = 0;
static int timer_n_harts = 1;

/* Calculate "x * n / d" without unnecessary overflow or loss of precision.
*
* Reference:
Expand Down Expand Up @@ -88,6 +94,7 @@ static uint64_t semu_timer_clocksource(semu_timer_t *timer)
static bool first_switch = true;

if (!boot_complete) {
timer_call_count++;
boot_ticks += ticks_increment;
return (uint64_t) boot_ticks;
}
Expand All @@ -98,6 +105,34 @@ static uint64_t semu_timer_clocksource(semu_timer_t *timer)

/* Calculate the offset between the real time and the emulator time */
offset = (int64_t) (real_ticks - boot_ticks);

#ifdef SEMU_TIMER_STATS
/* Output timer calibration statistics (only when SEMU_TIMER_STATS is
* defined) */
double actual_coefficient = (double) timer_call_count / timer_n_harts;
double current_coefficient = 1.744e8;
double recommended_coefficient = actual_coefficient;

fprintf(stderr, "\n[Timer Calibration Statistics]\n");
fprintf(stderr, " Boot completed after %llu timer calls\n",
(unsigned long long) timer_call_count);
fprintf(stderr, " Number of harts: %d\n", timer_n_harts);
fprintf(stderr, " Actual coefficient: %.3e (%.2f calls per hart)\n",
actual_coefficient, actual_coefficient);
fprintf(stderr, " Current coefficient: %.3e\n", current_coefficient);
fprintf(stderr, " Difference: %.2f%% %s\n",
fabs(actual_coefficient - current_coefficient) /
current_coefficient * 100.0,
actual_coefficient > current_coefficient ? "(more calls)"
: "(fewer calls)");
fprintf(stderr, "\n[Recommendation]\n");
fprintf(stderr, " Update utils.c line 121 to:\n");
fprintf(stderr,
" ticks_increment = (SEMU_BOOT_TARGET_TIME * CLOCK_FREQ) / "
"(%.3e * n_harts);\n",
recommended_coefficient);
fprintf(stderr, "\n");
#endif
}
return (uint64_t) ((int64_t) real_ticks - offset);
}
Expand All @@ -108,14 +143,28 @@ void semu_timer_init(semu_timer_t *timer, uint64_t freq, int n_harts)
timer->begin = mult_frac(host_time_ns(), timer->freq, 1e9);
boot_ticks = timer->begin; /* Initialize the fake ticks for boot process */

/* Store n_harts for calibration statistics */
timer_n_harts = n_harts;

/* According to statistics, the number of times 'semu_timer_clocksource'
 * is called during boot is approximately 'SMP count * 1.744 * 1e8'. By the
 * time the boot process is completed, the emulator will have a total of
 * 'boot seconds * frequency' ticks. Therefore, on each call,
 * '(boot seconds * frequency) / (1.744 * 1e8 * SMP count)' ticks need to
 * be added.
 *
 * Note: This coefficient was recalibrated after the MMU cache optimization
 * (8×2 set-associative with 99%+ hit rate). The original coefficient
 * (2.15 * 1e8) was based on measurements before the optimization. With
 * faster CPU execution, fewer timer calls are needed to complete boot.
 *
 * Calibration history:
 * - Original (pre-MMU cache): 2.15 × 10^8
 * - After MMU cache (measured): 1.696 × 10^8 (-21.1%)
 * - Verification measurement: 1.744 × 10^8 (error: 2.85%)
 * - Final coefficient: 1.744 × 10^8 (based on verification)
 */
ticks_increment =
(SEMU_BOOT_TARGET_TIME * CLOCK_FREQ) / (2.15 * 1e8 * n_harts);
(SEMU_BOOT_TARGET_TIME * CLOCK_FREQ) / (1.744 * 1e8 * n_harts);
}

uint64_t semu_timer_get(semu_timer_t *timer)
Expand Down
Loading