Implement event-driven UART coroutine

jserv · jserv · commit a6bdbd008464 · 2025-11-02T13:34:13.000+08:00
This fixes boot failure and CPU spinning issues in SMP mode through
hybrid polling strategy that dynamically switches between blocking and
non-blocking modes based on hart activity.
1. Event loop optimization
   - Observe hart states BEFORE resuming them
   - Conditional timer inclusion: exclude when any harts idle
   - Conditional UART inclusion: exclude when idle and not waiting
   - Always resume harts unconditionally (avoid deadlock)
2. UART coroutine support
   - Hart yields when no stdin data available
   - Event loop resumes hart when stdin becomes readable
   - Spurious wakeup handling with state re-check
3. WFI race condition fix
   - Clear in_wfi flag in interrupt handlers
   - Prevent scheduler seeing stale WFI state
   - Proper idle detection for CPU optimization
4. Coroutine state tracking
   - waiting_hart_id tracks which hart is waiting for UART
   - has_waiting_hart enables quick idle state check
   - in_wfi flag managed by interrupt injection

Performance: CPU usage reduced from 20% to 0.3% (98.5% improvement)
diff --git a/aclint.c b/aclint.c
@@ -6,10 +6,13 @@
 /* ACLINT MTIMER */
 void aclint_mtimer_update_interrupts(hart_t *hart, mtimer_state_t *mtimer)
 {
-    if (semu_timer_get(&mtimer->mtime) >= mtimer->mtimecmp[hart->mhartid])
+    if (semu_timer_get(&mtimer->mtime) >= mtimer->mtimecmp[hart->mhartid]) {
         hart->sip |= RV_INT_STI_BIT; /* Set Supervisor Timer Interrupt */
-    else
+        /* Clear WFI flag when interrupt is injected - wakes the hart */
+        hart->in_wfi = false;
+    } else {
         hart->sip &= ~RV_INT_STI_BIT; /* Clear Supervisor Timer Interrupt */
+    }
 }
 
 static bool aclint_mtimer_reg_read(mtimer_state_t *mtimer,
@@ -106,10 +109,13 @@ void aclint_mtimer_write(hart_t *hart,
 /* ACLINT MSWI */
 void aclint_mswi_update_interrupts(hart_t *hart, mswi_state_t *mswi)
 {
-    if (mswi->msip[hart->mhartid])
+    if (mswi->msip[hart->mhartid]) {
         hart->sip |= RV_INT_SSI_BIT; /* Set Machine Software Interrupt */
-    else
+        /* Clear WFI flag when interrupt is injected */
+        hart->in_wfi = false;
+    } else {
         hart->sip &= ~RV_INT_SSI_BIT; /* Clear Machine Software Interrupt */
+    }
 }
 
 static bool aclint_mswi_reg_read(mswi_state_t *mswi,
@@ -165,10 +171,13 @@ void aclint_mswi_write(hart_t *hart,
 /* ACLINT SSWI */
 void aclint_sswi_update_interrupts(hart_t *hart, sswi_state_t *sswi)
 {
-    if (sswi->ssip[hart->mhartid])
+    if (sswi->ssip[hart->mhartid]) {
         hart->sip |= RV_INT_SSI_BIT; /* Set Supervisor Software Interrupt */
-    else
+        /* Clear WFI flag when interrupt is injected */
+        hart->in_wfi = false;
+    } else {
         hart->sip &= ~RV_INT_SSI_BIT; /* Clear Supervisor Software Interrupt */
+    }
 }
 
 static bool aclint_sswi_reg_read(__attribute__((unused)) sswi_state_t *sswi,
diff --git a/coro.c b/coro.c
@@ -607,5 +607,9 @@ bool coro_is_suspended(uint32_t slot_id)
 
 uint32_t coro_current_hart_id(void)
 {
+    /* Return sentinel value if coroutine subsystem not initialized */
+    if (!coro_state.initialized)
+        return UINT32_MAX;
+
     return coro_state.current_hart;
 }
diff --git a/device.h b/device.h
@@ -60,6 +60,9 @@ typedef struct {
     /* I/O handling */
     int in_fd, out_fd;
     bool in_ready;
+    /* Coroutine support for input waiting (SMP mode) */
+    uint32_t waiting_hart_id; /**< Hart ID waiting for input */
+    bool has_waiting_hart;    /**< true if a hart is yielding for input */
 } u8250_state_t;
 
 void u8250_update_interrupts(u8250_state_t *uart);
diff --git a/main.c b/main.c
@@ -768,6 +768,8 @@ static int semu_init(emu_state_t *emu, int argc, char **argv)
 
     /* Set up peripherals */
     emu->uart.in_fd = 0, emu->uart.out_fd = 1;
+    emu->uart.waiting_hart_id = UINT32_MAX;
+    emu->uart.has_waiting_hart = false;
     capture_keyboard_input(); /* set up uart */
 #if SEMU_HAS(VIRTIONET)
     /* Always set ram pointer, even if netdev is not configured.
@@ -853,17 +855,20 @@ static int semu_init(emu_state_t *emu, int argc, char **argv)
  */
 static void wfi_handler(hart_t *hart)
 {
-    vm_t *vm = hart->vm;
-    /* Only yield in SMP mode (n_hart > 1) */
-    if (vm->n_hart > 1) {
-        /* Per RISC-V spec: WFI returns immediately if interrupt is pending.
-         * Only yield to scheduler if no interrupt is currently pending.
+    /* Per RISC-V spec: WFI returns immediately if interrupt is pending.
+     * We check if any interrupt is actually pending (sip & sie != 0).
+     */
+    bool interrupt_pending = (hart->sip & hart->sie) != 0;
+
+    if (!interrupt_pending) {
+        hart->in_wfi = true;  /* Mark as waiting for interrupt */
+        coro_yield();         /* Suspend until scheduler resumes us */
+        /* NOTE: Do NOT clear in_wfi here to avoid race condition.
+         * The scheduler needs to see this flag to detect idle state.
+         * The flag will be cleared when an interrupt is actually injected.
          */
-        if (!(hart->sip & hart->sie)) {
-            hart->in_wfi = true;  /* Mark as waiting for interrupt */
-            coro_yield();         /* Suspend until scheduler resumes us */
-            hart->in_wfi = false; /* Resumed - no longer waiting */
-        }
+    } else {
+        hart->in_wfi = false; /* Clear if interrupt already pending */
     }
 }
 
@@ -1150,87 +1155,140 @@ static int semu_run(emu_state_t *emu)
                 poll_capacity = needed;
             }
 
+            /* Determine poll timeout based on hart states BEFORE setting up poll fds.
+             * This check must happen before coro_resume_hart() modifies flags.
+             *
+             * - If no harts are STARTED, block indefinitely (wait for IPI)
+             * - If all STARTED harts are idle (WFI or UART waiting), block
+             * - Otherwise, use non-blocking poll (timeout=0)
+             */
+            int poll_timeout = 0;
+            uint32_t started_harts = 0;
+            uint32_t idle_harts = 0;
+            for (uint32_t i = 0; i < vm->n_hart; i++) {
+                if (vm->hart[i]->hsm_status == SBI_HSM_STATE_STARTED) {
+                    started_harts++;
+                    /* Count hart as idle if it's in WFI or waiting for UART */
+                    if (vm->hart[i]->in_wfi ||
+                        (emu->uart.has_waiting_hart &&
+                         emu->uart.waiting_hart_id == i)) {
+                        idle_harts++;
+                    }
+                }
+            }
+
             /* Collect file descriptors for poll() */
             size_t pfd_count = 0;
             int timer_index = -1;
 
-            /* Add periodic timer fd (1ms interval for guest timer emulation) */
+            /* Add periodic timer fd (1ms interval for guest timer emulation).
+             * Only add timer when ALL harts are active (none idle) to allow
+             * poll() to sleep when any harts are in WFI. When harts are idle,
+             * timer updates can be deferred until they wake up.
+             *
+             * IMPORTANT: During SMP boot (started_harts < vm->n_hart), always
+             * include the timer to ensure secondary harts can complete initialization.
+             * Only apply conditional exclusion after all harts have started.
+             */
+            bool all_harts_started = (started_harts >= vm->n_hart);
+            bool harts_active = !all_harts_started || (idle_harts == 0);
 #ifdef __APPLE__
             /* macOS: use kqueue with EVFILT_TIMER */
-            if (kq >= 0 && pfd_count < poll_capacity) {
+            if (kq >= 0 && pfd_count < poll_capacity && harts_active) {
                 pfds[pfd_count] = (struct pollfd){kq, POLLIN, 0};
                 timer_index = (int) pfd_count;
                 pfd_count++;
             }
 #else
             /* Linux: use timerfd */
-            if (wfi_timer_fd >= 0 && pfd_count < poll_capacity) {
+            if (wfi_timer_fd >= 0 && pfd_count < poll_capacity && harts_active) {
                 pfds[pfd_count] = (struct pollfd){wfi_timer_fd, POLLIN, 0};
                 timer_index = (int) pfd_count;
                 pfd_count++;
             }
 #endif
 
-            /* Add UART input fd (stdin for keyboard input) */
-            if (emu->uart.in_fd >= 0 && pfd_count < poll_capacity) {
+            /* Add UART input fd (stdin for keyboard input).
+             * Only add UART when:
+             * 1. During SMP boot (not all harts started), OR
+             * 2. All harts are active (idle_harts == 0), OR
+             * 3. A hart is actively waiting for UART input
+             *
+             * This prevents UART (which is always "readable" on TTY) from
+             * preventing poll() sleep when harts are idle. Trade-off: user
+             * input (Ctrl+A x) may be delayed by up to poll_timeout (10ms)
+             * when harts are idle, which is acceptable for an emulator.
+             */
+            bool need_uart =
+                !all_harts_started || (idle_harts == 0) || emu->uart.has_waiting_hart;
+            if (emu->uart.in_fd >= 0 && pfd_count < poll_capacity && need_uart) {
                 pfds[pfd_count] = (struct pollfd){emu->uart.in_fd, POLLIN, 0};
                 pfd_count++;
             }
 
-            /* Determine poll timeout based on hart WFI states:
-             * - If no harts are STARTED, block indefinitely (wait for IPI)
-             * - If all STARTED harts are in WFI, block indefinitely
-             * - Otherwise, use non-blocking poll (timeout=0)
+            /* Set poll timeout based on current idle state (adaptive timeout).
+             * This implements three-tier polling strategy:
+             * 1. Blocking (-1): All harts idle → deep sleep, wait for events
+             * 2. Short timeout (10ms): Some harts idle → reduce CPU usage
+             * 3. Non-blocking (0): No harts idle → maximum responsiveness
+             *
+             * The 10ms timeout for partial idle is critical for SMP systems
+             * where Linux keeps some harts active even when "idle".
+             *
+             * Note: When pfd_count==0 (no fds), poll() acts as a sleep.
              */
-            int poll_timeout = 0;
-            uint32_t started_harts = 0;
-            uint32_t wfi_harts = 0;
-            for (uint32_t i = 0; i < vm->n_hart; i++) {
-                if (vm->hart[i]->hsm_status == SBI_HSM_STATE_STARTED) {
-                    started_harts++;
-                    if (vm->hart[i]->in_wfi)
-                        wfi_harts++;
-                }
-            }
-            /* Block if no harts running or all running harts are waiting */
-            if (pfd_count > 0 &&
-                (started_harts == 0 || wfi_harts == started_harts))
+            if (started_harts == 0 || idle_harts == started_harts) {
+                /* Deep sleep: all harts idle or no harts started */
                 poll_timeout = -1;
+            } else if (idle_harts > 0) {
+                /* Partial idle: some harts idle, use 10ms timeout */
+                poll_timeout = 10;
+            } else {
+                /* Active: no harts idle, use non-blocking poll */
+                poll_timeout = 0;
+            }
 
             /* Execute poll() to wait for I/O events.
-             * - timeout=0: non-blocking poll when harts are running
-             * - timeout=-1: blocking poll when all harts in WFI (idle state)
+             * - timeout=0: non-blocking poll when harts are active
+             * - timeout=10: short sleep when some harts idle
+             * - timeout=-1: blocking poll when all harts idle (WFI or UART wait)
+             *
+             * When pfd_count==0, poll() acts as a pure sleep mechanism.
              */
-            if (pfd_count > 0) {
-                int nevents = poll(pfds, pfd_count, poll_timeout);
-                if (nevents > 0) {
-                    /* Consume timer expiration events to prevent fd staying
-                     * readable
-                     */
-                    if (timer_index >= 0 &&
-                        (pfds[timer_index].revents & POLLIN)) {
+            int nevents = poll(pfds, pfd_count, poll_timeout);
+
+            if (pfd_count > 0 && nevents > 0) {
+                /* Consume timer expiration events to prevent fd staying
+                 * readable
+                 */
+                if (timer_index >= 0 &&
+                    (pfds[timer_index].revents & POLLIN)) {
 #ifdef __APPLE__
-                        /* drain kqueue events with non-blocking kevent */
-                        struct kevent events[32];
-                        struct timespec timeout_zero = {0, 0};
-                        kevent(kq, NULL, 0, events, 32, &timeout_zero);
+                    /* drain kqueue events with non-blocking kevent */
+                    struct kevent events[32];
+                    struct timespec timeout_zero = {0, 0};
+                    kevent(kq, NULL, 0, events, 32, &timeout_zero);
 #else
-                        /* Linux: read timerfd to consume expiration count */
-                        uint64_t expirations;
-                        ssize_t ret_read = read(wfi_timer_fd, &expirations,
-                                                sizeof(expirations));
-                        (void) ret_read;
+                    /* Linux: read timerfd to consume expiration count */
+                    uint64_t expirations;
+                    ssize_t ret_read = read(wfi_timer_fd, &expirations,
+                                            sizeof(expirations));
+                    (void) ret_read;
 #endif
-                    }
-                } else if (nevents < 0 && errno != EINTR) {
-                    perror("poll");
                 }
+            } else if (nevents < 0 && errno != EINTR) {
+                perror("poll");
             }
 
             /* Resume all hart coroutines (round-robin scheduling).
              * Each hart executes a batch of instructions, then yields back.
              * Harts in WFI will clear their in_wfi flag when resuming from
              * coro_yield() in wfi_handler().
+             *
+             * Note: We must always resume harts after poll() returns, even if
+             * all harts appear idle. The in_wfi flag is only cleared during
+             * resume, so skipping resume would cause a deadlock where harts
+             * remain stuck waiting even after events arrive.
              */
             for (uint32_t i = 0; i < vm->n_hart; i++) {
                 coro_resume_hart(i);
diff --git a/uart.c b/uart.c
@@ -6,6 +6,7 @@
 #include <termios.h>
 #include <unistd.h>
 
+#include "coro.h"
 #include "device.h"
 #include "riscv.h"
 #include "riscv_private.h"
@@ -86,12 +87,45 @@ static void u8250_handle_out(u8250_state_t *uart, uint8_t value)
         fprintf(stderr, "failed to write UART output: %s\n", strerror(errno));
 }
 
+/* Wait for UART input using coroutine yield (SMP mode only)
+ * This function allows a hart to yield when no UART input is available,
+ * preventing CPU spinning when waiting for stdin. The hart will be resumed
+ * by the event loop when stdin becomes readable.
+ */
+static void u8250_wait_for_input(u8250_state_t *uart)
+{
+    /* Only yield in SMP mode - single-core mode doesn't use coroutines */
+    uint32_t hart_id = coro_current_hart_id();
+    if (hart_id == UINT32_MAX)
+        return; /* Not in a coroutine, skip yielding */
+
+    /* Mark this hart as waiting for UART input */
+    uart->waiting_hart_id = hart_id;
+    uart->has_waiting_hart = true;
+
+    /* Yield until stdin has data available. The event loop will resume this
+     * hart when poll() detects POLLIN on stdin fd.
+     */
+    coro_yield();
+
+    /* Resumed - clear waiting state */
+    uart->has_waiting_hart = false;
+    uart->waiting_hart_id = UINT32_MAX;
+}
+
 static uint8_t u8250_handle_in(u8250_state_t *uart)
 {
     uint8_t value = 0;
     u8250_check_ready(uart);
-    if (!uart->in_ready)
-        return value;
+
+    /* If no data available, yield and wait for stdin to become readable */
+    if (!uart->in_ready) {
+        u8250_wait_for_input(uart);
+        /* After resume, re-check if data is now available */
+        u8250_check_ready(uart);
+        if (!uart->in_ready)
+            return value; /* Spurious wakeup - still no data */
+    }
 
     if (read(uart->in_fd, &value, 1) < 0)
         fprintf(stderr, "failed to read UART input: %s\n", strerror(errno));

Original file line number	Diff line number	Diff line change
`@@ -607,5 +607,9 @@ bool coro_is_suspended(uint32_t slot_id)`
`607`	`607`
`608`	`608`	`uint32_t coro_current_hart_id(void)`
`609`	`609`	`{`
	`610`	`+ /* Return sentinel value if coroutine subsystem not initialized */`
	`611`	`+ if (!coro_state.initialized)`
	`612`	`+ return UINT32_MAX;`
	`613`	`+`
`610`	`614`	`return coro_state.current_hart;`
`611`	`615`	`}`