From 8ec2f0cbb69368387d57dbbc30b5093b4c8ac8b3 Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Fri, 31 Jan 2025 09:55:35 -0800 Subject: [PATCH 1/3] FDT dedup log sync -- remove incremental This PR condenses the FDT dedup log syncing into a single sync pass. This reduces the overhead of modifying indirect blocks for the dedup table multiple times per txg. In addition, changes were made to the formula for how much to sync per txg. We now also consider the backlog we have to clear, to prevent it from growing too large, or remaining large on an idle system. Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. Authored-by: Don Brady Authored-by: Paul Dagnelie Signed-off-by: Paul Dagnelie --- include/sys/ddt.h | 7 +- include/sys/vdev.h | 1 + include/sys/zfs_debug.h | 1 + man/man4/zfs.4 | 71 ++-- module/zfs/ddt.c | 342 +++++++++--------- module/zfs/vdev_queue.c | 10 + tests/runfiles/common.run | 4 +- tests/zfs-tests/include/tunables.cfg | 2 + tests/zfs-tests/tests/Makefile.am | 1 + .../functional/dedup/dedup_fdt_pacing.ksh | 107 ++++++ .../tests/functional/dedup/dedup_quota.ksh | 3 + 11 files changed, 351 insertions(+), 198 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/dedup/dedup_fdt_pacing.ksh diff --git a/include/sys/ddt.h b/include/sys/ddt.h index 4e5ccd46318e..17675c2e28eb 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -285,14 +285,11 @@ typedef struct { ddt_log_t *ddt_log_active; /* pointers into ddt_log */ ddt_log_t *ddt_log_flushing; /* swapped when flush starts */ - hrtime_t ddt_flush_start; /* log flush start this txg */ - uint32_t ddt_flush_pass; /* log flush pass this txg */ - - int32_t ddt_flush_count; /* entries flushed this txg */ - int32_t ddt_flush_min; /* min rem entries to flush */ int32_t ddt_log_ingest_rate; /* rolling log ingest rate */ int32_t ddt_log_flush_rate; /* rolling log flush rate */ int32_t ddt_log_flush_time_rate; /* avg time spent flushing */ + uint32_t ddt_log_flush_pressure; /* pressure to apply for cap */ + uint32_t ddt_log_flush_prev_backlog; /* prev backlog size */ uint64_t ddt_flush_force_txg; /* flush hard before this txg */ diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 38f62b07dc59..744717f8f21b 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -171,6 +171,7 @@ extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority); extern uint32_t vdev_queue_length(vdev_t *vd); extern uint64_t vdev_queue_last_offset(vdev_t *vd); extern uint64_t vdev_queue_class_length(vdev_t *vq, zio_priority_t p); +extern boolean_t vdev_queue_pool_busy(spa_t *spa); extern void vdev_config_dirty(vdev_t *vd); extern void vdev_config_clean(vdev_t *vd); diff --git a/include/sys/zfs_debug.h b/include/sys/zfs_debug.h index e509c8b7c638..428563a91263 100644 --- a/include/sys/zfs_debug.h +++ b/include/sys/zfs_debug.h @@ -59,6 +59,7 @@ extern int zfs_dbgmsg_enable; #define ZFS_DEBUG_METASLAB_ALLOC (1 << 13) #define ZFS_DEBUG_BRT (1 << 14) #define ZFS_DEBUG_RAIDZ_RECONSTRUCT (1 << 15) +#define ZFS_DEBUG_DDT (1 << 16) extern void __set_error(const char *file, const char *func, int line, int err); extern void __zfs_dbgmsg(char *buf); diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index dd0b3d848fe9..c3807d7b99b3 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1026,27 +1026,6 @@ milliseconds until the operation completes. .It Sy zfs_dedup_prefetch Ns = Ns Sy 0 Ns | Ns 1 Pq int Enable prefetching dedup-ed blocks which are going to be freed. . 
-.It Sy zfs_dedup_log_flush_passes_max Ns = Ns Sy 8 Ns Pq uint -Maximum number of dedup log flush passes (iterations) each transaction. -.Pp -At the start of each transaction, OpenZFS will estimate how many entries it -needs to flush out to keep up with the change rate, taking the amount and time -taken to flush on previous txgs into account (see -.Sy zfs_dedup_log_flush_flow_rate_txgs ) . -It will spread this amount into a number of passes. -At each pass, it will use the amount already flushed and the total time taken -by flushing and by other IO to recompute how much it should do for the remainder -of the txg. -.Pp -Reducing the max number of passes will make flushing more aggressive, flushing -out more entries on each pass. -This can be faster, but also more likely to compete with other IO. -Increasing the max number of passes will put fewer entries onto each pass, -keeping the overhead of dedup changes to a minimum but possibly causing a large -number of changes to be dumped on the last pass, which can blow out the txg -sync time beyond -.Sy zfs_txg_timeout . -. .It Sy zfs_dedup_log_flush_min_time_ms Ns = Ns Sy 1000 Ns Pq uint Minimum time to spend on dedup log flush each transaction. .Pp @@ -1056,22 +1035,58 @@ up to This occurs even if doing so would delay the transaction, that is, other IO completes under this time. . -.It Sy zfs_dedup_log_flush_entries_min Ns = Ns Sy 1000 Ns Pq uint +.It Sy zfs_dedup_log_flush_entries_min Ns = Ns Sy 100 Ns Pq uint Flush at least this many entries each transaction. .Pp -OpenZFS will estimate how many entries it needs to flush each transaction to -keep up with the ingest rate (see -.Sy zfs_dedup_log_flush_flow_rate_txgs ) . -This sets the minimum for that estimate. +OpenZFS will a fraction of the log every TXG, to keep the size proportional +to the ingest rate (see +.Sy zfs_dedup_log_flush_txgs) . +This sets the minimum for that estimate, which prevents the backlog from +never completely draining if the ingest rate falls. Raising it can force OpenZFS to flush more aggressively, keeping the log small and so reducing pool import times, but can make it less able to back off if log flushing would compete with other IO too much. . +.It Sy zfs_dedup_log_flush_entries_max Ns = Ns Sy UINT_MAX Ns Pq uint +Flush at most this many entries each transaction. +.Pp +Mostly used for debugging purposes. +.It Sy zfs_dedup_log_flush_txgs Ns = Ns Sy 100 Ns Pq uint +Target number of TXGs to process the whole dedup log. +.Pp +Every TXG, OpenZFS will process the inverse of this number times the size +of the DDT backlog. +This will keep the backlog at a size roughly equal to the ingest rate +times this value. +This offers a balance between a more efficient DDT log, with better +aggregation, and shorter import times, which increase as the size of the +DDT log increases. +Increasing this value will result in a more efficient DDT log, but longer +import times. +.It Sy zfs_dedup_log_cap Ns = Ns Sy UINT_MAX Ns Pq uint +Soft cap for the size of the current dedup log. +.Pp +If the log is larger than this size, we increase the aggressiveness of +the flushing to try to bring it back down to the soft cap. +Setting it will reduce import times, but will reduce the efficiency of +the DDT log, increasing the expected number of IOs required to flush the same +amount of data. +.It Sy zfs_dedup_log_hard_cap Ns = Ns Sy 0 Ns | Ns 1 Pq int +Whether to treat the log cap as a firm cap or not. 
+.Pp
+When set to 0 (the default), the
+.Sy zfs_dedup_log_cap
+will increase the maximum number of log entries we flush in a given txg.
+This will bring the backlog size down towards the cap, but not at the expense
+of making TXG syncs take longer.
+If this is set to 1, the cap acts more like a hard cap than a soft cap; it will
+also increase the minimum number of log entries we flush per TXG.
+Enabling it will reduce worst-case import times, at the cost of increased TXG
+sync times.
.It Sy zfs_dedup_log_flush_flow_rate_txgs Ns = Ns Sy 10 Ns Pq uint
Number of transactions to use to compute the flow rate.
.Pp
-OpenZFS will estimate how many entries it needs to flush each transaction by
-monitoring the number of entries changed (ingest rate), number of entries
+OpenZFS will estimate the flow rate by monitoring the number of entries
+changed (ingest rate), number of entries
flushed (flush rate) and time spent flushing (flush time rate) and combining
these into an overall "flow rate".
It will use an exponential weighted moving average over some number of recent
txgs.
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index e8b9fb498d4b..a86cb850fa3f 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -249,11 +249,6 @@ static uint32_t zfs_ddt_prunes_per_txg = 50000;
boolean_t ddt_prune_artificial_age = B_FALSE;
boolean_t ddt_dump_prune_histogram = B_FALSE;
-/*
- * Don't do more than this many incremental flush passes per txg.
- */
-uint_t zfs_dedup_log_flush_passes_max = 8;
-
/*
* Minimum time to flush per txg.
*/
@@ -262,7 +257,32 @@ uint_t zfs_dedup_log_flush_min_time_ms = 1000;
/*
* Minimum entries to flush per txg.
*/
-uint_t zfs_dedup_log_flush_entries_min = 1000;
+uint_t zfs_dedup_log_flush_entries_min = 200;
+
+/*
+ * Target number of TXGs until the whole dedup log has been flushed.
+ * The log size will float around this value times the ingest rate.
+ */
+uint_t zfs_dedup_log_flush_txgs = 100;
+
+/*
+ * Maximum entries to flush per txg. Used for testing the dedup log.
+ */
+uint_t zfs_dedup_log_flush_entries_max = UINT_MAX;
+
+/*
+ * Soft cap for the size of the current dedup log. If the log is larger
+ * than this size, we slightly increase the aggressiveness of the flushing to
+ * try to bring it back down to the soft cap.
+ */
+uint_t zfs_dedup_log_cap = UINT_MAX;
+
+/*
+ * If this is set to B_TRUE, the cap above acts more like a hard cap:
+ * flushing is significantly more aggressive, increasing the minimum amount we
+ * flush per txg, as well as the maximum.
+ */
+boolean_t zfs_dedup_log_hard_cap = B_FALSE;
/*
* Number of txgs to average flow rates across.
@@ -1577,6 +1597,7 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c)
ddt->ddt_spa = spa;
ddt->ddt_os = spa->spa_meta_objset;
ddt->ddt_version = DDT_VERSION_UNCONFIGURED;
+ ddt->ddt_log_flush_pressure = 10;
ddt_log_alloc(ddt);
ddt_table_alloc_kstats(ddt);
@@ -1990,146 +2011,6 @@ _ewma(int32_t val, int32_t prev, uint32_t weight)
return (new);
}
-/* Returns true if done for this txg */
-static boolean_t
-ddt_sync_flush_log_incremental(ddt_t *ddt, dmu_tx_t *tx)
-{
- if (ddt->ddt_flush_pass == 0) {
- if (spa_sync_pass(ddt->ddt_spa) == 1) {
- /* First run this txg, get set up */
- ddt->ddt_flush_start = gethrtime();
- ddt->ddt_flush_count = 0;
-
- /*
- * How many entries we need to flush. We want to at
- * least match the ingest rate.
- */
- ddt->ddt_flush_min = MAX(
- ddt->ddt_log_ingest_rate,
- zfs_dedup_log_flush_entries_min);
-
- /*
- * If we've been asked to flush everything in a hurry,
- * try to dump as much as possible on this txg. 
In - * this case we're only limited by time, not amount. - */ - if (ddt->ddt_flush_force_txg > 0) - ddt->ddt_flush_min = - MAX(ddt->ddt_flush_min, avl_numnodes( - &ddt->ddt_log_flushing->ddl_tree)); - } else { - /* We already decided we're done for this txg */ - return (B_FALSE); - } - } else if (ddt->ddt_flush_pass == spa_sync_pass(ddt->ddt_spa)) { - /* - * We already did some flushing on this pass, skip it. This - * happens when dsl_process_async_destroys() runs during a scan - * (on pass 1) and does an additional ddt_sync() to update - * freed blocks. - */ - return (B_FALSE); - } - - if (spa_sync_pass(ddt->ddt_spa) > - MAX(zfs_dedup_log_flush_passes_max, 1)) { - /* Too many passes this txg, defer until next. */ - ddt->ddt_flush_pass = 0; - return (B_TRUE); - } - - if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) { - /* Nothing to flush, done for this txg. */ - ddt->ddt_flush_pass = 0; - return (B_TRUE); - } - - uint64_t target_time = txg_sync_waiting(ddt->ddt_spa->spa_dsl_pool) ? - MIN(MSEC2NSEC(zfs_dedup_log_flush_min_time_ms), - SEC2NSEC(zfs_txg_timeout)) : SEC2NSEC(zfs_txg_timeout); - - uint64_t elapsed_time = gethrtime() - ddt->ddt_flush_start; - - if (elapsed_time >= target_time) { - /* Too long since we started, done for this txg. */ - ddt->ddt_flush_pass = 0; - return (B_TRUE); - } - - ddt->ddt_flush_pass++; - ASSERT3U(spa_sync_pass(ddt->ddt_spa), ==, ddt->ddt_flush_pass); - - /* - * Estimate how much time we'll need to flush the remaining entries - * based on how long it normally takes. - */ - uint32_t want_time; - if (ddt->ddt_flush_pass == 1) { - /* First pass, use the average time/entries */ - if (ddt->ddt_log_flush_rate == 0) - /* Zero rate, just assume the whole time */ - want_time = target_time; - else - want_time = ddt->ddt_flush_min * - ddt->ddt_log_flush_time_rate / - ddt->ddt_log_flush_rate; - } else { - /* Later pass, calculate from this txg so far */ - want_time = ddt->ddt_flush_min * - elapsed_time / ddt->ddt_flush_count; - } - - /* Figure out how much time we have left */ - uint32_t remain_time = target_time - elapsed_time; - - /* Smear the remaining entries over the remaining passes. */ - uint32_t nentries = ddt->ddt_flush_min / - (MAX(1, zfs_dedup_log_flush_passes_max) + 1 - ddt->ddt_flush_pass); - if (want_time > remain_time) { - /* - * We're behind; try to catch up a bit by doubling the amount - * this pass. If we're behind that means we're in a later - * pass and likely have most of the remaining time to - * ourselves. If we're in the last couple of passes, then - * doubling might just take us over the timeout, but probably - * not be much, and it stops us falling behind. If we're - * in the middle passes, there'll be more to do, but it - * might just help us catch up a bit and we'll recalculate on - * the next pass anyway. - */ - nentries = MIN(ddt->ddt_flush_min, nentries*2); - } - - ddt_lightweight_entry_t ddlwe; - uint32_t count = 0; - while (ddt_log_take_first(ddt, ddt->ddt_log_flushing, &ddlwe)) { - ddt_sync_flush_entry(ddt, &ddlwe, - ddlwe.ddlwe_type, ddlwe.ddlwe_class, tx); - - /* End this pass if we've synced as much as we need to. 
*/ - if (++count >= nentries) - break; - } - ddt->ddt_flush_count += count; - ddt->ddt_flush_min -= count; - - if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) { - /* We emptied it, so truncate on-disk */ - DDT_KSTAT_ZERO(ddt, dds_log_flushing_entries); - ddt_log_truncate(ddt, tx); - /* No more passes needed this txg */ - ddt->ddt_flush_pass = 0; - } else { - /* More to do next time, save checkpoint */ - DDT_KSTAT_SUB(ddt, dds_log_flushing_entries, count); - ddt_log_checkpoint(ddt, &ddlwe, tx); - } - - ddt_sync_update_stats(ddt, tx); - - return (ddt->ddt_flush_pass == 0); -} - static inline void ddt_flush_force_update_txg(ddt_t *ddt, uint64_t txg) { @@ -2167,19 +2048,135 @@ ddt_flush_force_update_txg(ddt_t *ddt, uint64_t txg) static void ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx) { + spa_t *spa = ddt->ddt_spa; ASSERT(avl_is_empty(&ddt->ddt_tree)); - /* Don't do any flushing when the pool is ready to shut down */ - if (tx->tx_txg > spa_final_dirty_txg(ddt->ddt_spa)) + /* + * Don't do any flushing when the pool is ready to shut down, or in + * passes beyond the first. + */ + if (spa_sync_pass(spa) > 1 || tx->tx_txg > spa_final_dirty_txg(spa)) return; - /* Try to flush some. */ - if (!ddt_sync_flush_log_incremental(ddt, tx)) - /* More to do next time */ - return; + hrtime_t flush_start = gethrtime(); + uint32_t count = 0; + + /* + * How many entries we need to flush. We need to at + * least match the ingest rate, and also consider the + * current backlog of entries. + */ + uint64_t backlog = avl_numnodes(&ddt->ddt_log_flushing->ddl_tree) + + avl_numnodes(&ddt->ddt_log_active->ddl_tree); + + if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) + goto housekeeping; + + uint64_t txgs = MAX(1, zfs_dedup_log_flush_txgs); + uint64_t cap = MAX(1, zfs_dedup_log_cap); + uint64_t flush_min = MAX(backlog / txgs, + zfs_dedup_log_flush_entries_min); + + /* + * The theory for this block is that if we increase the pressure while + * we're growing above the cap, and remove it when we're significantly + * below the cap, we'll stay near cap while not bouncing around too + * much. + * + * The factor of 10 is to smooth the pressure effect by expressing it + * in tenths. The addition of the cap to the backlog in the second + * block is to round up, instead of down. We never let the pressure go + * below 1 (10 tenths). + */ + if (cap != UINT32_MAX && backlog > cap && + backlog > ddt->ddt_log_flush_prev_backlog) { + ddt->ddt_log_flush_pressure += 10 * backlog / cap; + } else if (cap != UINT32_MAX && backlog < cap) { + ddt->ddt_log_flush_pressure -= + 11 - (((10 * backlog) + cap - 1) / cap); + ddt->ddt_log_flush_pressure = + MAX(ddt->ddt_log_flush_pressure, 10); + } + + if (zfs_dedup_log_hard_cap && cap != UINT32_MAX) + flush_min = MAX(flush_min, MIN(backlog - cap, + (flush_min * ddt->ddt_log_flush_pressure) / 10)); + + uint64_t flush_max; + + /* + * If we've been asked to flush everything in a hurry, + * try to dump as much as possible on this txg. In + * this case we're only limited by time, not amount. + * + * Otherwise, if we are over the cap, try to get back down to it. + * + * Finally if there is no cap (or no pressure), just set the max a + * little higher than the min to help smooth out variations in flush + * times. 
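+ * (For example, with no cap and no forced flush, a flush_min of 1000
+ * yields a flush_max of 1250.)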
+ */ + if (ddt->ddt_flush_force_txg > 0) + flush_max = avl_numnodes(&ddt->ddt_log_flushing->ddl_tree); + else if (cap != UINT32_MAX && !zfs_dedup_log_hard_cap) + flush_max = MAX(flush_min * 5 / 4, MIN(backlog - cap, + (flush_min * ddt->ddt_log_flush_pressure) / 10)); + else + flush_max = flush_min * 5 / 4; + flush_max = MIN(flush_max, zfs_dedup_log_flush_entries_max); + + /* + * When the pool is busy or someone is explicitly waiting for this txg + * to complete, use the zfs_dedup_log_flush_min_time_ms. Otherwise use + * half of the time in the txg timeout. + */ + uint64_t target_time; + + if (txg_sync_waiting(ddt->ddt_spa->spa_dsl_pool) || + vdev_queue_pool_busy(spa)) { + target_time = MIN(MSEC2NSEC(zfs_dedup_log_flush_min_time_ms), + SEC2NSEC(zfs_txg_timeout) / 2); + } else { + target_time = SEC2NSEC(zfs_txg_timeout) / 2; + } + + ddt_lightweight_entry_t ddlwe; + while (ddt_log_take_first(ddt, ddt->ddt_log_flushing, &ddlwe)) { + ddt_sync_flush_entry(ddt, &ddlwe, + ddlwe.ddlwe_type, ddlwe.ddlwe_class, tx); + + /* End if we've synced as much as we needed to. */ + if (++count >= flush_max) + break; + + /* + * As long as we've flushed the absolute minimum, + * stop if we're way over our target time. + */ + uint64_t diff = gethrtime() - flush_start; + if (count > zfs_dedup_log_flush_entries_min && + diff >= target_time * 2) + break; - /* No more flushing this txg, so we can do end-of-txg housekeeping */ + /* + * End if we've passed the minimum flush and we're out of time. + */ + if (count > flush_min && diff >= target_time) + break; + } + if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) { + /* We emptied it, so truncate on-disk */ + DDT_KSTAT_ZERO(ddt, dds_log_flushing_entries); + ddt_log_truncate(ddt, tx); + } else { + /* More to do next time, save checkpoint */ + DDT_KSTAT_SUB(ddt, dds_log_flushing_entries, count); + ddt_log_checkpoint(ddt, &ddlwe, tx); + } + + ddt_sync_update_stats(ddt, tx); + +housekeeping: if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) && !avl_is_empty(&ddt->ddt_log_active->ddl_tree)) { /* @@ -2196,12 +2193,13 @@ ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx) /* If force flush is no longer necessary, turn it off. */ ddt_flush_force_update_txg(ddt, 0); + ddt->ddt_log_flush_prev_backlog = backlog; + /* - * Update flush rate. This is an exponential weighted moving average of - * the number of entries flushed over recent txgs. + * Update flush rate. This is an exponential weighted moving + * average of the number of entries flushed over recent txgs. */ - ddt->ddt_log_flush_rate = _ewma( - ddt->ddt_flush_count, ddt->ddt_log_flush_rate, + ddt->ddt_log_flush_rate = _ewma(count, ddt->ddt_log_flush_rate, zfs_dedup_log_flush_flow_rate_txgs); DDT_KSTAT_SET(ddt, dds_log_flush_rate, ddt->ddt_log_flush_rate); @@ -2209,12 +2207,21 @@ ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx) * Update flush time rate. This is an exponential weighted moving * average of the total time taken to flush over recent txgs. 
*/ - ddt->ddt_log_flush_time_rate = _ewma( - ddt->ddt_log_flush_time_rate, - ((int32_t)(NSEC2MSEC(gethrtime() - ddt->ddt_flush_start))), + ddt->ddt_log_flush_time_rate = _ewma(ddt->ddt_log_flush_time_rate, + (int32_t)NSEC2MSEC(gethrtime() - flush_start), zfs_dedup_log_flush_flow_rate_txgs); DDT_KSTAT_SET(ddt, dds_log_flush_time_rate, ddt->ddt_log_flush_time_rate); + if (avl_numnodes(&ddt->ddt_log_flushing->ddl_tree) > 0 && + zfs_flags & ZFS_DEBUG_DDT) { + zfs_dbgmsg("%lu entries remain(%lu in active), flushed %u @ " + "txg %llu, in %llu ms, flush rate %d, time rate %d", + (ulong_t)avl_numnodes(&ddt->ddt_log_flushing->ddl_tree), + (ulong_t)avl_numnodes(&ddt->ddt_log_active->ddl_tree), + count, (u_longlong_t)tx->tx_txg, + (u_longlong_t)NSEC2MSEC(gethrtime() - flush_start), + ddt->ddt_log_flush_rate, ddt->ddt_log_flush_time_rate); + } } static void @@ -2762,14 +2769,23 @@ ddt_prune_unique_entries(spa_t *spa, zpool_ddt_prune_unit_t unit, ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW, "Enable prefetching dedup-ed blks"); -ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_passes_max, UINT, ZMOD_RW, - "Max number of incremental dedup log flush passes per transaction"); - ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_min_time_ms, UINT, ZMOD_RW, "Min time to spend on incremental dedup log flush each transaction"); ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_entries_min, UINT, ZMOD_RW, "Min number of log entries to flush each transaction"); +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_entries_max, UINT, ZMOD_RW, + "Max number of log entries to flush each transaction"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_txgs, UINT, ZMOD_RW, + "Number of TXGs to try to rotate the log in"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_cap, UINT, ZMOD_RW, + "Soft cap for the size of the current dedup log"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_hard_cap, INT, ZMOD_RW, + "Whether to use the soft cap as a hard cap"); + ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_flow_rate_txgs, UINT, ZMOD_RW, "Number of txgs to average flow rates across"); diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 092b3f375be0..79d23a9b03c1 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -1049,6 +1049,16 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) mutex_exit(&vq->vq_lock); } +boolean_t +vdev_queue_pool_busy(spa_t *spa) +{ + dsl_pool_t *dp = spa_get_dsl(spa); + uint64_t min_bytes = zfs_dirty_data_max * + zfs_vdev_async_write_active_min_dirty_percent / 100; + + return (dp->dp_dirty_total > min_bytes); +} + /* * As these two methods are only used for load calculations we're not * concerned if we get an incorrect value on 32bit platforms due to lack of diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index e2edfc9ebbb5..02209fc5f120 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -676,8 +676,8 @@ post = tags = ['functional', 'deadman'] [tests/functional/dedup] -tests = ['dedup_fdt_create', 'dedup_fdt_import', 'dedup_legacy_create', - 'dedup_legacy_import', 'dedup_legacy_fdt_upgrade', +tests = ['dedup_fdt_create', 'dedup_fdt_import', 'dedup_fdt_pacing', + 'dedup_legacy_create', 'dedup_legacy_import', 'dedup_legacy_fdt_upgrade', 'dedup_legacy_fdt_mixed', 'dedup_quota'] pre = post = diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 2024c44cc138..0a546dd44553 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ 
b/tests/zfs-tests/include/tunables.cfg @@ -32,6 +32,8 @@ DDT_ZAP_DEFAULT_BS dedup.ddt_zap_default_bs ddt_zap_default_bs DDT_ZAP_DEFAULT_IBS dedup.ddt_zap_default_ibs ddt_zap_default_ibs DDT_DATA_IS_SPECIAL ddt_data_is_special zfs_ddt_data_is_special DEDUP_LOG_TXG_MAX dedup.log_txg_max zfs_dedup_log_txg_max +DEDUP_LOG_FLUSH_ENTRIES_MAX dedup.log_flush_entries_max zfs_dedup_log_flush_entries_max +DEDUP_LOG_FLUSH_ENTRIES_MIN dedup.log_flush_entries_min zfs_dedup_log_flush_entries_min DEADMAN_CHECKTIME_MS deadman.checktime_ms zfs_deadman_checktime_ms DEADMAN_EVENTS_PER_SECOND deadman_events_per_second zfs_deadman_events_per_second DEADMAN_FAILMODE deadman.failmode zfs_deadman_failmode diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index dcefb26a4036..8db0337daabf 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1437,6 +1437,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/dedup/setup.ksh \ functional/dedup/dedup_fdt_create.ksh \ functional/dedup/dedup_fdt_import.ksh \ + functional/dedup/dedup_fdt_pacing.ksh \ functional/dedup/dedup_legacy_create.ksh \ functional/dedup/dedup_legacy_import.ksh \ functional/dedup/dedup_legacy_fdt_upgrade.ksh \ diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_pacing.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_pacing.ksh new file mode 100755 index 000000000000..58af546e19d2 --- /dev/null +++ b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_pacing.ksh @@ -0,0 +1,107 @@ +#!/bin/ksh -p +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025 Klara, Inc. +# + +# Ensure dedup log flushes are appropriately paced + +. $STF_SUITE/include/libtest.shlib + +log_assert "dedup (FDT) paces out log entries appropriately" + +function get_ddt_log_entries +{ + zdb -D $TESTPOOL | grep -- "-log-sha256-" | sed 's/.*entries=//' | \ + awk '{sum += $1} END {print sum}' +} + +function cleanup +{ + destroy_pool $TESTPOOL + log_must restore_tunable DEDUP_LOG_FLUSH_ENTRIES_MAX +} + +log_onexit cleanup + +# Create a pool with fast dedup enabled. We disable block cloning to ensure +# it doesn't get in the way of dedup. +log_must zpool create -f \ + -o feature@fast_dedup=enabled \ + -o feature@block_cloning=disabled \ + $TESTPOOL $DISKS + +# Create a filesystem with a small recordsize so that we get more DDT entries, +# disable compression so our writes create predictable results on disk, and +# use 'xattr=sa' to prevent selinux xattrs influencing our accounting +log_must zfs create \ + -o dedup=on \ + -o compression=off \ + -o xattr=sa \ + -o checksum=sha256 \ + -o recordsize=4k $TESTPOOL/fs + +# Set the dedup log to only flush a single entry per txg. 
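+# DEDUP_LOG_FLUSH_ENTRIES_MAX clamps flush_max directly, so while it is set
+# to 1, at most one log entry should be flushed per txg.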
+# It's hard to guarantee that exactly one flush will happen per txg, or that +# we don't miss a txg due to weird latency or anything, so we build some +# wiggle room into subsequent checks. + +log_must save_tunable DEDUP_LOG_FLUSH_ENTRIES_MAX +log_must set_tunable32 DEDUP_LOG_FLUSH_ENTRIES_MAX 1 + +# Create a file. This is 256 full blocks, so will produce 256 entries in the +# dedup log. +log_must dd if=/dev/urandom of=/$TESTPOOL/fs/file1 bs=128k count=8 +sync_pool + +# Verify there are at least 240 entries in the dedup log. +log_entries=$(get_ddt_log_entries) +[[ "$log_entries" -gt 240 ]] || \ + log_fail "Fewer than 240 entries in dedup log: $log_entries" + +# Wait for 5 TXGs to sync. +for i in `seq 1 5`; do + sync_pool +done + +# Verify there are at least 220 entries in the dedup log. +log_entries2=$(get_ddt_log_entries) +[[ $((log_entries - log_entries2)) -lt 20 ]] || \ + log_fail "Too many entries pruned from dedup log: " \ + "from $log_entries to $log_entries2" +[[ $((log_entries - log_entries2)) -gt 5 ]] || \ + log_fail "Too few entries pruned from dedup log: " \ + "from $log_entries to $log_entries2" + +# Set the log flush rate high enough to clear the whole list. +log_must set_tunable32 DEDUP_LOG_FLUSH_ENTRIES_MAX 1024 +sync_pool + +# Verify there are 0 entries in the dedup log. +log_entries3=$(get_ddt_log_entries) +[[ "$log_entries3" -eq 0 ]] || \ + log_fail "Entries still present in dedup log: $log_entries3" + +# Verify there are 256 entries in the unique table. +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique:.*entries=256'" + +log_pass "dedup (FDT) paces out log entries appropriately" diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh index b1657648b5a1..39e46ce1c569 100755 --- a/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh +++ b/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh @@ -56,6 +56,8 @@ save_tunable TXG_TIMEOUT # where things appear on-disk log_must save_tunable DEDUP_LOG_TXG_MAX log_must set_tunable32 DEDUP_LOG_TXG_MAX 1 +log_must save_tunable DEDUP_LOG_FLUSH_ENTRIES_MIN +log_must set_tunable32 DEDUP_LOG_FLUSH_ENTRIES_MIN 100000 function cleanup { @@ -65,6 +67,7 @@ function cleanup log_must rm -fd $VDEV_GENERAL $VDEV_DEDUP $MOUNTDIR log_must restore_tunable TXG_TIMEOUT log_must restore_tunable DEDUP_LOG_TXG_MAX + log_must restore_tunable DEDUP_LOG_FLUSH_ENTRIES_MIN } From 037537ff1ae06009368a2e2bf041a86b9e4f8230 Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Mon, 10 Feb 2025 09:16:38 -0800 Subject: [PATCH 2/3] style Signed-off-by: Paul Dagnelie --- man/man4/zfs.4 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index c3807d7b99b3..d6ed6f8e60cc 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1040,12 +1040,12 @@ Flush at least this many entries each transaction. .Pp OpenZFS will a fraction of the log every TXG, to keep the size proportional to the ingest rate (see -.Sy zfs_dedup_log_flush_txgs) . +.Sy zfs_dedup_log_flush_txgs ) . This sets the minimum for that estimate, which prevents the backlog from never completely draining if the ingest rate falls. -Raising it can force OpenZFS to flush more aggressively, keeping the log small -and so reducing pool import times, but can make it less able to back off if -log flushing would compete with other IO too much. 
+Raising it can force OpenZFS to flush more aggressively, reducing the backlog +to zero more quickly, but can make it less able to back off if log +flushing would compete with other IO too much. . .It Sy zfs_dedup_log_flush_entries_max Ns = Ns Sy UINT_MAX Ns Pq uint Flush at most this many entries each transaction. From b21b18f5ed329cceeb63e94acaf41211c16d4e62 Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Tue, 11 Feb 2025 10:30:34 -0800 Subject: [PATCH 3/3] tony hutter feedback Signed-off-by: Paul Dagnelie --- man/man4/zfs.4 | 6 +++--- module/zfs/ddt.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index d6ed6f8e60cc..12cb3048390f 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1038,8 +1038,8 @@ completes under this time. .It Sy zfs_dedup_log_flush_entries_min Ns = Ns Sy 100 Ns Pq uint Flush at least this many entries each transaction. .Pp -OpenZFS will a fraction of the log every TXG, to keep the size proportional -to the ingest rate (see +OpenZFS will flush a fraction of the log every TXG, to keep the size +proportional to the ingest rate (see .Sy zfs_dedup_log_flush_txgs ) . This sets the minimum for that estimate, which prevents the backlog from never completely draining if the ingest rate falls. @@ -1071,7 +1071,7 @@ the flushing to try to bring it back down to the soft cap. Setting it will reduce import times, but will reduce the efficiency of the DDT log, increasing the expected number of IOs required to flush the same amount of data. -.It Sy zfs_dedup_log_hard_cap Ns = Ns Sy 0 Ns | Ns 1 Pq int +.It Sy zfs_dedup_log_hard_cap Ns = Ns Sy 0 Ns | Ns 1 Pq uint Whether to treat the log cap as a firm cap or not. .Pp When set to 0 (the default), the diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index a86cb850fa3f..b294c5f71ca0 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -2784,7 +2784,7 @@ ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_txgs, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_cap, UINT, ZMOD_RW, "Soft cap for the size of the current dedup log"); -ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_hard_cap, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_hard_cap, UINT, ZMOD_RW, "Whether to use the soft cap as a hard cap"); ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_flow_rate_txgs, UINT, ZMOD_RW,
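
For reviewers who want to experiment with the new pacing arithmetic outside
the kernel, below is a small user-space sketch of the flush_min/flush_max
computation from ddt_sync_flush_log() as patched above. It is an illustration
only: the force-flush path and the zfs_dedup_log_flush_entries_max clamp are
omitted, the CAP_UNSET sentinel stands in for the kernel's UINT_MAX check,
and the tunable and backlog values are invented for the demo.

/*
 * Standalone sketch of the flush pacing introduced in patch 1/3.
 * Simplified: no force-flush path, no entries_max clamp, plain C types.
 */
#include <stdio.h>
#include <stdint.h>

#define CAP_UNSET 0xffffffffULL	/* stands in for the "no cap" default */

static uint64_t umax(uint64_t a, uint64_t b) { return (a > b ? a : b); }
static uint64_t umin(uint64_t a, uint64_t b) { return (a < b ? a : b); }

int
main(void)
{
	/* Tunables (zfs_dedup_log_* in the patch); demo values. */
	uint64_t flush_txgs = 100;	/* target txgs to drain the backlog */
	uint64_t entries_min = 200;	/* floor on entries flushed per txg */
	uint64_t cap = 100000;		/* soft cap on log size */
	int hard_cap = 0;		/* zfs_dedup_log_hard_cap */

	/* Per-DDT state; invented numbers for the demo. */
	uint64_t backlog = 150000;	/* flushing + active log entries */
	uint64_t prev_backlog = 140000;	/* backlog at the end of last txg */
	uint64_t pressure = 10;		/* ddt_log_flush_pressure, tenths */

	/* Base amount: drain 1/flush_txgs of the backlog, at least the min. */
	uint64_t flush_min = umax(backlog / flush_txgs, entries_min);

	/* Pressure rises while the backlog grows above the cap... */
	if (cap != CAP_UNSET && backlog > cap && backlog > prev_backlog)
		pressure += 10 * backlog / cap;
	/* ...and decays (never below 10 tenths) once below the cap. */
	else if (cap != CAP_UNSET && backlog < cap)
		pressure = umax(10,
		    pressure - (11 - ((10 * backlog + cap - 1) / cap)));

	/* Hard cap: pressure raises the minimum too. */
	if (hard_cap && cap != CAP_UNSET)
		flush_min = umax(flush_min,
		    umin(backlog - cap, flush_min * pressure / 10));

	/* Soft cap raises only the maximum; otherwise max = min + 25%. */
	uint64_t flush_max;
	if (cap != CAP_UNSET && !hard_cap)
		flush_max = umax(flush_min * 5 / 4,
		    umin(backlog - cap, flush_min * pressure / 10));
	else
		flush_max = flush_min * 5 / 4;

	printf("flush_min=%llu flush_max=%llu pressure=%llu\n",
	    (unsigned long long)flush_min,
	    (unsigned long long)flush_max,
	    (unsigned long long)pressure);
	return (0);
}

With these inputs (a backlog 50% over the cap and still growing), pressure
climbs from 10 to 25 tenths and flush_max rises from the baseline 1875 to
3750 entries while flush_min stays at 1500, which is the intended soft-cap
behavior: the pool catches up faster without making every txg sync longer.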