From 8ec2f0cbb69368387d57dbbc30b5093b4c8ac8b3 Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Fri, 31 Jan 2025 09:55:35 -0800 Subject: [PATCH 1/3] FDT dedup log sync -- remove incremental This PR condenses the FDT dedup log syncing into a single sync pass. This reduces the overhead of modifying indirect blocks for the dedup table multiple times per txg. In addition, changes were made to the formula for how much to sync per txg. We now also consider the backlog we have to clear, to prevent it from growing too large, or remaining large on an idle system. Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. Authored-by: Don Brady Authored-by: Paul Dagnelie Signed-off-by: Paul Dagnelie --- include/sys/ddt.h | 7 +- include/sys/vdev.h | 1 + include/sys/zfs_debug.h | 1 + man/man4/zfs.4 | 71 ++-- module/zfs/ddt.c | 342 +++++++++--------- module/zfs/vdev_queue.c | 10 + tests/runfiles/common.run | 4 +- tests/zfs-tests/include/tunables.cfg | 2 + tests/zfs-tests/tests/Makefile.am | 1 + .../functional/dedup/dedup_fdt_pacing.ksh | 107 ++++++ .../tests/functional/dedup/dedup_quota.ksh | 3 + 11 files changed, 351 insertions(+), 198 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/dedup/dedup_fdt_pacing.ksh diff --git a/include/sys/ddt.h b/include/sys/ddt.h index 4e5ccd46318e..17675c2e28eb 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -285,14 +285,11 @@ typedef struct { ddt_log_t *ddt_log_active; /* pointers into ddt_log */ ddt_log_t *ddt_log_flushing; /* swapped when flush starts */ - hrtime_t ddt_flush_start; /* log flush start this txg */ - uint32_t ddt_flush_pass; /* log flush pass this txg */ - - int32_t ddt_flush_count; /* entries flushed this txg */ - int32_t ddt_flush_min; /* min rem entries to flush */ int32_t ddt_log_ingest_rate; /* rolling log ingest rate */ int32_t ddt_log_flush_rate; /* rolling log flush rate */ int32_t ddt_log_flush_time_rate; /* avg time spent flushing */ + uint32_t ddt_log_flush_pressure; /* pressure to apply for cap */ + uint32_t ddt_log_flush_prev_backlog; /* prev backlog size */ uint64_t ddt_flush_force_txg; /* flush hard before this txg */ diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 38f62b07dc59..744717f8f21b 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -171,6 +171,7 @@ extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority); extern uint32_t vdev_queue_length(vdev_t *vd); extern uint64_t vdev_queue_last_offset(vdev_t *vd); extern uint64_t vdev_queue_class_length(vdev_t *vq, zio_priority_t p); +extern boolean_t vdev_queue_pool_busy(spa_t *spa); extern void vdev_config_dirty(vdev_t *vd); extern void vdev_config_clean(vdev_t *vd); diff --git a/include/sys/zfs_debug.h b/include/sys/zfs_debug.h index e509c8b7c638..428563a91263 100644 --- a/include/sys/zfs_debug.h +++ b/include/sys/zfs_debug.h @@ -59,6 +59,7 @@ extern int zfs_dbgmsg_enable; #define ZFS_DEBUG_METASLAB_ALLOC (1 << 13) #define ZFS_DEBUG_BRT (1 << 14) #define ZFS_DEBUG_RAIDZ_RECONSTRUCT (1 << 15) +#define ZFS_DEBUG_DDT (1 << 16) extern void __set_error(const char *file, const char *func, int line, int err); extern void __zfs_dbgmsg(char *buf); diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index dd0b3d848fe9..c3807d7b99b3 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1026,27 +1026,6 @@ milliseconds until the operation completes. .It Sy zfs_dedup_prefetch Ns = Ns Sy 0 Ns | Ns 1 Pq int Enable prefetching dedup-ed blocks which are going to be freed. . 
-.It Sy zfs_dedup_log_flush_passes_max Ns = Ns Sy 8 Ns Pq uint -Maximum number of dedup log flush passes (iterations) each transaction. -.Pp -At the start of each transaction, OpenZFS will estimate how many entries it -needs to flush out to keep up with the change rate, taking the amount and time -taken to flush on previous txgs into account (see -.Sy zfs_dedup_log_flush_flow_rate_txgs ) . -It will spread this amount into a number of passes. -At each pass, it will use the amount already flushed and the total time taken -by flushing and by other IO to recompute how much it should do for the remainder -of the txg. -.Pp -Reducing the max number of passes will make flushing more aggressive, flushing -out more entries on each pass. -This can be faster, but also more likely to compete with other IO. -Increasing the max number of passes will put fewer entries onto each pass, -keeping the overhead of dedup changes to a minimum but possibly causing a large -number of changes to be dumped on the last pass, which can blow out the txg -sync time beyond -.Sy zfs_txg_timeout . -. .It Sy zfs_dedup_log_flush_min_time_ms Ns = Ns Sy 1000 Ns Pq uint Minimum time to spend on dedup log flush each transaction. .Pp @@ -1056,22 +1035,58 @@ up to This occurs even if doing so would delay the transaction, that is, other IO completes under this time. . -.It Sy zfs_dedup_log_flush_entries_min Ns = Ns Sy 1000 Ns Pq uint +.It Sy zfs_dedup_log_flush_entries_min Ns = Ns Sy 100 Ns Pq uint Flush at least this many entries each transaction. .Pp -OpenZFS will estimate how many entries it needs to flush each transaction to -keep up with the ingest rate (see -.Sy zfs_dedup_log_flush_flow_rate_txgs ) . -This sets the minimum for that estimate. +OpenZFS will a fraction of the log every TXG, to keep the size proportional +to the ingest rate (see +.Sy zfs_dedup_log_flush_txgs) . +This sets the minimum for that estimate, which prevents the backlog from +never completely draining if the ingest rate falls. Raising it can force OpenZFS to flush more aggressively, keeping the log small and so reducing pool import times, but can make it less able to back off if log flushing would compete with other IO too much. . +.It Sy zfs_dedup_log_flush_entries_max Ns = Ns Sy UINT_MAX Ns Pq uint +Flush at most this many entries each transaction. +.Pp +Mostly used for debugging purposes. +.It Sy zfs_dedup_log_flush_txgs Ns = Ns Sy 100 Ns Pq uint +Target number of TXGs to process the whole dedup log. +.Pp +Every TXG, OpenZFS will process the inverse of this number times the size +of the DDT backlog. +This will keep the backlog at a size roughly equal to the ingest rate +times this value. +This offers a balance between a more efficient DDT log, with better +aggregation, and shorter import times, which increase as the size of the +DDT log increases. +Increasing this value will result in a more efficient DDT log, but longer +import times. +.It Sy zfs_dedup_log_cap Ns = Ns Sy UINT_MAX Ns Pq uint +Soft cap for the size of the current dedup log. +.Pp +If the log is larger than this size, we increase the aggressiveness of +the flushing to try to bring it back down to the soft cap. +Setting it will reduce import times, but will reduce the efficiency of +the DDT log, increasing the expected number of IOs required to flush the same +amount of data. +.It Sy zfs_dedup_log_hard_cap Ns = Ns Sy 0 Ns | Ns 1 Pq int +Whether to treat the log cap as a firm cap or not. 
+.Pp
+When set to 0 (the default), the
+.Sy zfs_dedup_log_cap
+will increase the maximum number of log entries we flush in a given txg.
+This will bring the backlog size down towards the cap, but not at the expense
+of making TXG syncs take longer.
+If this is set to 1, the cap acts more like a hard cap than a soft cap; it will
+also increase the minimum number of log entries we flush per TXG.
+Enabling it will reduce worst-case import times, at the cost of increased TXG
+sync times.
.It Sy zfs_dedup_log_flush_flow_rate_txgs Ns = Ns Sy 10 Ns Pq uint
Number of transactions to use to compute the flow rate.
.Pp
-OpenZFS will estimate how many entries it needs to flush each transaction by
-monitoring the number of entries changed (ingest rate), number of entries
+OpenZFS will estimate the flow rate by monitoring the number of entries
+changed (ingest rate), number of entries
flushed (flush rate) and time spent flushing (flush time rate) and combining
these into an overall "flow rate".
It will use an exponential weighted moving average over some number of recent
txgs.
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index e8b9fb498d4b..a86cb850fa3f 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -249,11 +249,6 @@ static uint32_t zfs_ddt_prunes_per_txg = 50000;
boolean_t ddt_prune_artificial_age = B_FALSE;
boolean_t ddt_dump_prune_histogram = B_FALSE;
-/*
- * Don't do more than this many incremental flush passes per txg.
- */
-uint_t zfs_dedup_log_flush_passes_max = 8;
-
/*
* Minimum time to flush per txg.
*/
@@ -262,7 +257,32 @@ uint_t zfs_dedup_log_flush_min_time_ms = 1000;
/*
* Minimum entries to flush per txg.
*/
-uint_t zfs_dedup_log_flush_entries_min = 1000;
+uint_t zfs_dedup_log_flush_entries_min = 200;
+
+/*
+ * Target number of TXGs until the whole dedup log has been flushed.
+ * The log size will float around this value times the ingest rate.
+ */
+uint_t zfs_dedup_log_flush_txgs = 100;
+
+/*
+ * Maximum entries to flush per txg. Used for testing the dedup log.
+ */
+uint_t zfs_dedup_log_flush_entries_max = UINT_MAX;
+
+/*
+ * Soft cap for the size of the current dedup log. If the log is larger
+ * than this size, we slightly increase the aggressiveness of the flushing to
+ * try to bring it back down to the soft cap.
+ */
+uint_t zfs_dedup_log_cap = UINT_MAX;
+
+/*
+ * If this is set to B_TRUE, the cap above acts more like a hard cap:
+ * flushing is significantly more aggressive, increasing the minimum amount we
+ * flush per txg, as well as the maximum.
+ */
+boolean_t zfs_dedup_log_hard_cap = B_FALSE;
/*
* Number of txgs to average flow rates across.
@@ -1577,6 +1597,7 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c)
ddt->ddt_spa = spa;
ddt->ddt_os = spa->spa_meta_objset;
ddt->ddt_version = DDT_VERSION_UNCONFIGURED;
+ ddt->ddt_log_flush_pressure = 10;
ddt_log_alloc(ddt);
ddt_table_alloc_kstats(ddt);
@@ -1990,146 +2011,6 @@ _ewma(int32_t val, int32_t prev, uint32_t weight)
return (new);
}
-/* Returns true if done for this txg */
-static boolean_t
-ddt_sync_flush_log_incremental(ddt_t *ddt, dmu_tx_t *tx)
-{
- if (ddt->ddt_flush_pass == 0) {
- if (spa_sync_pass(ddt->ddt_spa) == 1) {
- /* First run this txg, get set up */
- ddt->ddt_flush_start = gethrtime();
- ddt->ddt_flush_count = 0;
-
- /*
- * How many entries we need to flush. We want to at
- * least match the ingest rate.
- */
- ddt->ddt_flush_min = MAX(
- ddt->ddt_log_ingest_rate,
- zfs_dedup_log_flush_entries_min);
-
- /*
- * If we've been asked to flush everything in a hurry,
- * try to dump as much as possible on this txg. 
In - * this case we're only limited by time, not amount. - */ - if (ddt->ddt_flush_force_txg > 0) - ddt->ddt_flush_min = - MAX(ddt->ddt_flush_min, avl_numnodes( - &ddt->ddt_log_flushing->ddl_tree)); - } else { - /* We already decided we're done for this txg */ - return (B_FALSE); - } - } else if (ddt->ddt_flush_pass == spa_sync_pass(ddt->ddt_spa)) { - /* - * We already did some flushing on this pass, skip it. This - * happens when dsl_process_async_destroys() runs during a scan - * (on pass 1) and does an additional ddt_sync() to update - * freed blocks. - */ - return (B_FALSE); - } - - if (spa_sync_pass(ddt->ddt_spa) > - MAX(zfs_dedup_log_flush_passes_max, 1)) { - /* Too many passes this txg, defer until next. */ - ddt->ddt_flush_pass = 0; - return (B_TRUE); - } - - if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) { - /* Nothing to flush, done for this txg. */ - ddt->ddt_flush_pass = 0; - return (B_TRUE); - } - - uint64_t target_time = txg_sync_waiting(ddt->ddt_spa->spa_dsl_pool) ? - MIN(MSEC2NSEC(zfs_dedup_log_flush_min_time_ms), - SEC2NSEC(zfs_txg_timeout)) : SEC2NSEC(zfs_txg_timeout); - - uint64_t elapsed_time = gethrtime() - ddt->ddt_flush_start; - - if (elapsed_time >= target_time) { - /* Too long since we started, done for this txg. */ - ddt->ddt_flush_pass = 0; - return (B_TRUE); - } - - ddt->ddt_flush_pass++; - ASSERT3U(spa_sync_pass(ddt->ddt_spa), ==, ddt->ddt_flush_pass); - - /* - * Estimate how much time we'll need to flush the remaining entries - * based on how long it normally takes. - */ - uint32_t want_time; - if (ddt->ddt_flush_pass == 1) { - /* First pass, use the average time/entries */ - if (ddt->ddt_log_flush_rate == 0) - /* Zero rate, just assume the whole time */ - want_time = target_time; - else - want_time = ddt->ddt_flush_min * - ddt->ddt_log_flush_time_rate / - ddt->ddt_log_flush_rate; - } else { - /* Later pass, calculate from this txg so far */ - want_time = ddt->ddt_flush_min * - elapsed_time / ddt->ddt_flush_count; - } - - /* Figure out how much time we have left */ - uint32_t remain_time = target_time - elapsed_time; - - /* Smear the remaining entries over the remaining passes. */ - uint32_t nentries = ddt->ddt_flush_min / - (MAX(1, zfs_dedup_log_flush_passes_max) + 1 - ddt->ddt_flush_pass); - if (want_time > remain_time) { - /* - * We're behind; try to catch up a bit by doubling the amount - * this pass. If we're behind that means we're in a later - * pass and likely have most of the remaining time to - * ourselves. If we're in the last couple of passes, then - * doubling might just take us over the timeout, but probably - * not be much, and it stops us falling behind. If we're - * in the middle passes, there'll be more to do, but it - * might just help us catch up a bit and we'll recalculate on - * the next pass anyway. - */ - nentries = MIN(ddt->ddt_flush_min, nentries*2); - } - - ddt_lightweight_entry_t ddlwe; - uint32_t count = 0; - while (ddt_log_take_first(ddt, ddt->ddt_log_flushing, &ddlwe)) { - ddt_sync_flush_entry(ddt, &ddlwe, - ddlwe.ddlwe_type, ddlwe.ddlwe_class, tx); - - /* End this pass if we've synced as much as we need to. 
*/ - if (++count >= nentries) - break; - } - ddt->ddt_flush_count += count; - ddt->ddt_flush_min -= count; - - if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) { - /* We emptied it, so truncate on-disk */ - DDT_KSTAT_ZERO(ddt, dds_log_flushing_entries); - ddt_log_truncate(ddt, tx); - /* No more passes needed this txg */ - ddt->ddt_flush_pass = 0; - } else { - /* More to do next time, save checkpoint */ - DDT_KSTAT_SUB(ddt, dds_log_flushing_entries, count); - ddt_log_checkpoint(ddt, &ddlwe, tx); - } - - ddt_sync_update_stats(ddt, tx); - - return (ddt->ddt_flush_pass == 0); -} - static inline void ddt_flush_force_update_txg(ddt_t *ddt, uint64_t txg) { @@ -2167,19 +2048,135 @@ ddt_flush_force_update_txg(ddt_t *ddt, uint64_t txg) static void ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx) { + spa_t *spa = ddt->ddt_spa; ASSERT(avl_is_empty(&ddt->ddt_tree)); - /* Don't do any flushing when the pool is ready to shut down */ - if (tx->tx_txg > spa_final_dirty_txg(ddt->ddt_spa)) + /* + * Don't do any flushing when the pool is ready to shut down, or in + * passes beyond the first. + */ + if (spa_sync_pass(spa) > 1 || tx->tx_txg > spa_final_dirty_txg(spa)) return; - /* Try to flush some. */ - if (!ddt_sync_flush_log_incremental(ddt, tx)) - /* More to do next time */ - return; + hrtime_t flush_start = gethrtime(); + uint32_t count = 0; + + /* + * How many entries we need to flush. We need to at + * least match the ingest rate, and also consider the + * current backlog of entries. + */ + uint64_t backlog = avl_numnodes(&ddt->ddt_log_flushing->ddl_tree) + + avl_numnodes(&ddt->ddt_log_active->ddl_tree); + + if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) + goto housekeeping; + + uint64_t txgs = MAX(1, zfs_dedup_log_flush_txgs); + uint64_t cap = MAX(1, zfs_dedup_log_cap); + uint64_t flush_min = MAX(backlog / txgs, + zfs_dedup_log_flush_entries_min); + + /* + * The theory for this block is that if we increase the pressure while + * we're growing above the cap, and remove it when we're significantly + * below the cap, we'll stay near cap while not bouncing around too + * much. + * + * The factor of 10 is to smooth the pressure effect by expressing it + * in tenths. The addition of the cap to the backlog in the second + * block is to round up, instead of down. We never let the pressure go + * below 1 (10 tenths). + */ + if (cap != UINT32_MAX && backlog > cap && + backlog > ddt->ddt_log_flush_prev_backlog) { + ddt->ddt_log_flush_pressure += 10 * backlog / cap; + } else if (cap != UINT32_MAX && backlog < cap) { + ddt->ddt_log_flush_pressure -= + 11 - (((10 * backlog) + cap - 1) / cap); + ddt->ddt_log_flush_pressure = + MAX(ddt->ddt_log_flush_pressure, 10); + } + + if (zfs_dedup_log_hard_cap && cap != UINT32_MAX) + flush_min = MAX(flush_min, MIN(backlog - cap, + (flush_min * ddt->ddt_log_flush_pressure) / 10)); + + uint64_t flush_max; + + /* + * If we've been asked to flush everything in a hurry, + * try to dump as much as possible on this txg. In + * this case we're only limited by time, not amount. + * + * Otherwise, if we are over the cap, try to get back down to it. + * + * Finally if there is no cap (or no pressure), just set the max a + * little higher than the min to help smooth out variations in flush + * times. 
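+ * (For example, with no cap and no forced flush, a flush_min of 1000
+ * yields a flush_max of 1250.)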
+ */ + if (ddt->ddt_flush_force_txg > 0) + flush_max = avl_numnodes(&ddt->ddt_log_flushing->ddl_tree); + else if (cap != UINT32_MAX && !zfs_dedup_log_hard_cap) + flush_max = MAX(flush_min * 5 / 4, MIN(backlog - cap, + (flush_min * ddt->ddt_log_flush_pressure) / 10)); + else + flush_max = flush_min * 5 / 4; + flush_max = MIN(flush_max, zfs_dedup_log_flush_entries_max); + + /* + * When the pool is busy or someone is explicitly waiting for this txg + * to complete, use the zfs_dedup_log_flush_min_time_ms. Otherwise use + * half of the time in the txg timeout. + */ + uint64_t target_time; + + if (txg_sync_waiting(ddt->ddt_spa->spa_dsl_pool) || + vdev_queue_pool_busy(spa)) { + target_time = MIN(MSEC2NSEC(zfs_dedup_log_flush_min_time_ms), + SEC2NSEC(zfs_txg_timeout) / 2); + } else { + target_time = SEC2NSEC(zfs_txg_timeout) / 2; + } + + ddt_lightweight_entry_t ddlwe; + while (ddt_log_take_first(ddt, ddt->ddt_log_flushing, &ddlwe)) { + ddt_sync_flush_entry(ddt, &ddlwe, + ddlwe.ddlwe_type, ddlwe.ddlwe_class, tx); + + /* End if we've synced as much as we needed to. */ + if (++count >= flush_max) + break; + + /* + * As long as we've flushed the absolute minimum, + * stop if we're way over our target time. + */ + uint64_t diff = gethrtime() - flush_start; + if (count > zfs_dedup_log_flush_entries_min && + diff >= target_time * 2) + break; - /* No more flushing this txg, so we can do end-of-txg housekeeping */ + /* + * End if we've passed the minimum flush and we're out of time. + */ + if (count > flush_min && diff >= target_time) + break; + } + if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) { + /* We emptied it, so truncate on-disk */ + DDT_KSTAT_ZERO(ddt, dds_log_flushing_entries); + ddt_log_truncate(ddt, tx); + } else { + /* More to do next time, save checkpoint */ + DDT_KSTAT_SUB(ddt, dds_log_flushing_entries, count); + ddt_log_checkpoint(ddt, &ddlwe, tx); + } + + ddt_sync_update_stats(ddt, tx); + +housekeeping: if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) && !avl_is_empty(&ddt->ddt_log_active->ddl_tree)) { /* @@ -2196,12 +2193,13 @@ ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx) /* If force flush is no longer necessary, turn it off. */ ddt_flush_force_update_txg(ddt, 0); + ddt->ddt_log_flush_prev_backlog = backlog; + /* - * Update flush rate. This is an exponential weighted moving average of - * the number of entries flushed over recent txgs. + * Update flush rate. This is an exponential weighted moving + * average of the number of entries flushed over recent txgs. */ - ddt->ddt_log_flush_rate = _ewma( - ddt->ddt_flush_count, ddt->ddt_log_flush_rate, + ddt->ddt_log_flush_rate = _ewma(count, ddt->ddt_log_flush_rate, zfs_dedup_log_flush_flow_rate_txgs); DDT_KSTAT_SET(ddt, dds_log_flush_rate, ddt->ddt_log_flush_rate); @@ -2209,12 +2207,21 @@ ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx) * Update flush time rate. This is an exponential weighted moving * average of the total time taken to flush over recent txgs. 
*/ - ddt->ddt_log_flush_time_rate = _ewma( - ddt->ddt_log_flush_time_rate, - ((int32_t)(NSEC2MSEC(gethrtime() - ddt->ddt_flush_start))), + ddt->ddt_log_flush_time_rate = _ewma(ddt->ddt_log_flush_time_rate, + (int32_t)NSEC2MSEC(gethrtime() - flush_start), zfs_dedup_log_flush_flow_rate_txgs); DDT_KSTAT_SET(ddt, dds_log_flush_time_rate, ddt->ddt_log_flush_time_rate); + if (avl_numnodes(&ddt->ddt_log_flushing->ddl_tree) > 0 && + zfs_flags & ZFS_DEBUG_DDT) { + zfs_dbgmsg("%lu entries remain(%lu in active), flushed %u @ " + "txg %llu, in %llu ms, flush rate %d, time rate %d", + (ulong_t)avl_numnodes(&ddt->ddt_log_flushing->ddl_tree), + (ulong_t)avl_numnodes(&ddt->ddt_log_active->ddl_tree), + count, (u_longlong_t)tx->tx_txg, + (u_longlong_t)NSEC2MSEC(gethrtime() - flush_start), + ddt->ddt_log_flush_rate, ddt->ddt_log_flush_time_rate); + } } static void @@ -2762,14 +2769,23 @@ ddt_prune_unique_entries(spa_t *spa, zpool_ddt_prune_unit_t unit, ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW, "Enable prefetching dedup-ed blks"); -ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_passes_max, UINT, ZMOD_RW, - "Max number of incremental dedup log flush passes per transaction"); - ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_min_time_ms, UINT, ZMOD_RW, "Min time to spend on incremental dedup log flush each transaction"); ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_entries_min, UINT, ZMOD_RW, "Min number of log entries to flush each transaction"); +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_entries_max, UINT, ZMOD_RW, + "Max number of log entries to flush each transaction"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_txgs, UINT, ZMOD_RW, + "Number of TXGs to try to rotate the log in"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_cap, UINT, ZMOD_RW, + "Soft cap for the size of the current dedup log"); + +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_hard_cap, INT, ZMOD_RW, + "Whether to use the soft cap as a hard cap"); + ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_flow_rate_txgs, UINT, ZMOD_RW, "Number of txgs to average flow rates across"); diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 092b3f375be0..79d23a9b03c1 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -1049,6 +1049,16 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) mutex_exit(&vq->vq_lock); } +boolean_t +vdev_queue_pool_busy(spa_t *spa) +{ + dsl_pool_t *dp = spa_get_dsl(spa); + uint64_t min_bytes = zfs_dirty_data_max * + zfs_vdev_async_write_active_min_dirty_percent / 100; + + return (dp->dp_dirty_total > min_bytes); +} + /* * As these two methods are only used for load calculations we're not * concerned if we get an incorrect value on 32bit platforms due to lack of diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index e2edfc9ebbb5..02209fc5f120 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -676,8 +676,8 @@ post = tags = ['functional', 'deadman'] [tests/functional/dedup] -tests = ['dedup_fdt_create', 'dedup_fdt_import', 'dedup_legacy_create', - 'dedup_legacy_import', 'dedup_legacy_fdt_upgrade', +tests = ['dedup_fdt_create', 'dedup_fdt_import', 'dedup_fdt_pacing', + 'dedup_legacy_create', 'dedup_legacy_import', 'dedup_legacy_fdt_upgrade', 'dedup_legacy_fdt_mixed', 'dedup_quota'] pre = post = diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 2024c44cc138..0a546dd44553 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ 
b/tests/zfs-tests/include/tunables.cfg @@ -32,6 +32,8 @@ DDT_ZAP_DEFAULT_BS dedup.ddt_zap_default_bs ddt_zap_default_bs DDT_ZAP_DEFAULT_IBS dedup.ddt_zap_default_ibs ddt_zap_default_ibs DDT_DATA_IS_SPECIAL ddt_data_is_special zfs_ddt_data_is_special DEDUP_LOG_TXG_MAX dedup.log_txg_max zfs_dedup_log_txg_max +DEDUP_LOG_FLUSH_ENTRIES_MAX dedup.log_flush_entries_max zfs_dedup_log_flush_entries_max +DEDUP_LOG_FLUSH_ENTRIES_MIN dedup.log_flush_entries_min zfs_dedup_log_flush_entries_min DEADMAN_CHECKTIME_MS deadman.checktime_ms zfs_deadman_checktime_ms DEADMAN_EVENTS_PER_SECOND deadman_events_per_second zfs_deadman_events_per_second DEADMAN_FAILMODE deadman.failmode zfs_deadman_failmode diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index dcefb26a4036..8db0337daabf 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1437,6 +1437,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/dedup/setup.ksh \ functional/dedup/dedup_fdt_create.ksh \ functional/dedup/dedup_fdt_import.ksh \ + functional/dedup/dedup_fdt_pacing.ksh \ functional/dedup/dedup_legacy_create.ksh \ functional/dedup/dedup_legacy_import.ksh \ functional/dedup/dedup_legacy_fdt_upgrade.ksh \ diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_fdt_pacing.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_pacing.ksh new file mode 100755 index 000000000000..58af546e19d2 --- /dev/null +++ b/tests/zfs-tests/tests/functional/dedup/dedup_fdt_pacing.ksh @@ -0,0 +1,107 @@ +#!/bin/ksh -p +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2025 Klara, Inc. +# + +# Ensure dedup log flushes are appropriately paced + +. $STF_SUITE/include/libtest.shlib + +log_assert "dedup (FDT) paces out log entries appropriately" + +function get_ddt_log_entries +{ + zdb -D $TESTPOOL | grep -- "-log-sha256-" | sed 's/.*entries=//' | \ + awk '{sum += $1} END {print sum}' +} + +function cleanup +{ + destroy_pool $TESTPOOL + log_must restore_tunable DEDUP_LOG_FLUSH_ENTRIES_MAX +} + +log_onexit cleanup + +# Create a pool with fast dedup enabled. We disable block cloning to ensure +# it doesn't get in the way of dedup. +log_must zpool create -f \ + -o feature@fast_dedup=enabled \ + -o feature@block_cloning=disabled \ + $TESTPOOL $DISKS + +# Create a filesystem with a small recordsize so that we get more DDT entries, +# disable compression so our writes create predictable results on disk, and +# use 'xattr=sa' to prevent selinux xattrs influencing our accounting +log_must zfs create \ + -o dedup=on \ + -o compression=off \ + -o xattr=sa \ + -o checksum=sha256 \ + -o recordsize=4k $TESTPOOL/fs + +# Set the dedup log to only flush a single entry per txg. 
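+# DEDUP_LOG_FLUSH_ENTRIES_MAX clamps flush_max directly, so while it is set
+# to 1, at most one log entry should be flushed per txg.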
+# It's hard to guarantee that exactly one flush will happen per txg, or that +# we don't miss a txg due to weird latency or anything, so we build some +# wiggle room into subsequent checks. + +log_must save_tunable DEDUP_LOG_FLUSH_ENTRIES_MAX +log_must set_tunable32 DEDUP_LOG_FLUSH_ENTRIES_MAX 1 + +# Create a file. This is 256 full blocks, so will produce 256 entries in the +# dedup log. +log_must dd if=/dev/urandom of=/$TESTPOOL/fs/file1 bs=128k count=8 +sync_pool + +# Verify there are at least 240 entries in the dedup log. +log_entries=$(get_ddt_log_entries) +[[ "$log_entries" -gt 240 ]] || \ + log_fail "Fewer than 240 entries in dedup log: $log_entries" + +# Wait for 5 TXGs to sync. +for i in `seq 1 5`; do + sync_pool +done + +# Verify there are at least 220 entries in the dedup log. +log_entries2=$(get_ddt_log_entries) +[[ $((log_entries - log_entries2)) -lt 20 ]] || \ + log_fail "Too many entries pruned from dedup log: " \ + "from $log_entries to $log_entries2" +[[ $((log_entries - log_entries2)) -gt 5 ]] || \ + log_fail "Too few entries pruned from dedup log: " \ + "from $log_entries to $log_entries2" + +# Set the log flush rate high enough to clear the whole list. +log_must set_tunable32 DEDUP_LOG_FLUSH_ENTRIES_MAX 1024 +sync_pool + +# Verify there are 0 entries in the dedup log. +log_entries3=$(get_ddt_log_entries) +[[ "$log_entries3" -eq 0 ]] || \ + log_fail "Entries still present in dedup log: $log_entries3" + +# Verify there are 256 entries in the unique table. +log_must eval "zdb -D $TESTPOOL | grep -q 'DDT-sha256-zap-unique:.*entries=256'" + +log_pass "dedup (FDT) paces out log entries appropriately" diff --git a/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh b/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh index b1657648b5a1..39e46ce1c569 100755 --- a/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh +++ b/tests/zfs-tests/tests/functional/dedup/dedup_quota.ksh @@ -56,6 +56,8 @@ save_tunable TXG_TIMEOUT # where things appear on-disk log_must save_tunable DEDUP_LOG_TXG_MAX log_must set_tunable32 DEDUP_LOG_TXG_MAX 1 +log_must save_tunable DEDUP_LOG_FLUSH_ENTRIES_MIN +log_must set_tunable32 DEDUP_LOG_FLUSH_ENTRIES_MIN 100000 function cleanup { @@ -65,6 +67,7 @@ function cleanup log_must rm -fd $VDEV_GENERAL $VDEV_DEDUP $MOUNTDIR log_must restore_tunable TXG_TIMEOUT log_must restore_tunable DEDUP_LOG_TXG_MAX + log_must restore_tunable DEDUP_LOG_FLUSH_ENTRIES_MIN } From 037537ff1ae06009368a2e2bf041a86b9e4f8230 Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Mon, 10 Feb 2025 09:16:38 -0800 Subject: [PATCH 2/3] style Signed-off-by: Paul Dagnelie --- man/man4/zfs.4 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index c3807d7b99b3..d6ed6f8e60cc 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1040,12 +1040,12 @@ Flush at least this many entries each transaction. .Pp OpenZFS will a fraction of the log every TXG, to keep the size proportional to the ingest rate (see -.Sy zfs_dedup_log_flush_txgs) . +.Sy zfs_dedup_log_flush_txgs ) . This sets the minimum for that estimate, which prevents the backlog from never completely draining if the ingest rate falls. -Raising it can force OpenZFS to flush more aggressively, keeping the log small -and so reducing pool import times, but can make it less able to back off if -log flushing would compete with other IO too much. 
+Raising it can force OpenZFS to flush more aggressively, reducing the backlog +to zero more quickly, but can make it less able to back off if log +flushing would compete with other IO too much. . .It Sy zfs_dedup_log_flush_entries_max Ns = Ns Sy UINT_MAX Ns Pq uint Flush at most this many entries each transaction. From b21b18f5ed329cceeb63e94acaf41211c16d4e62 Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Tue, 11 Feb 2025 10:30:34 -0800 Subject: [PATCH 3/3] tony hutter feedback Signed-off-by: Paul Dagnelie --- man/man4/zfs.4 | 6 +++--- module/zfs/ddt.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index d6ed6f8e60cc..12cb3048390f 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1038,8 +1038,8 @@ completes under this time. .It Sy zfs_dedup_log_flush_entries_min Ns = Ns Sy 100 Ns Pq uint Flush at least this many entries each transaction. .Pp -OpenZFS will a fraction of the log every TXG, to keep the size proportional -to the ingest rate (see +OpenZFS will flush a fraction of the log every TXG, to keep the size +proportional to the ingest rate (see .Sy zfs_dedup_log_flush_txgs ) . This sets the minimum for that estimate, which prevents the backlog from never completely draining if the ingest rate falls. @@ -1071,7 +1071,7 @@ the flushing to try to bring it back down to the soft cap. Setting it will reduce import times, but will reduce the efficiency of the DDT log, increasing the expected number of IOs required to flush the same amount of data. -.It Sy zfs_dedup_log_hard_cap Ns = Ns Sy 0 Ns | Ns 1 Pq int +.It Sy zfs_dedup_log_hard_cap Ns = Ns Sy 0 Ns | Ns 1 Pq uint Whether to treat the log cap as a firm cap or not. .Pp When set to 0 (the default), the diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index a86cb850fa3f..b294c5f71ca0 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -2784,7 +2784,7 @@ ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_txgs, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_cap, UINT, ZMOD_RW, "Soft cap for the size of the current dedup log"); -ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_hard_cap, INT, ZMOD_RW, +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_hard_cap, UINT, ZMOD_RW, "Whether to use the soft cap as a hard cap"); ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_flow_rate_txgs, UINT, ZMOD_RW,
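
For reviewers who want to experiment with the new pacing arithmetic outside
the kernel, below is a small user-space sketch of the flush_min/flush_max
computation from ddt_sync_flush_log() as patched above. It is an illustration
only: the force-flush path and the zfs_dedup_log_flush_entries_max clamp are
omitted, the CAP_UNSET sentinel stands in for the kernel's UINT_MAX check,
and the tunable and backlog values are invented for the demo.

/*
 * Standalone sketch of the flush pacing introduced in patch 1/3.
 * Simplified: no force-flush path, no entries_max clamp, plain C types.
 */
#include <stdio.h>
#include <stdint.h>

#define CAP_UNSET 0xffffffffULL	/* stands in for the "no cap" default */

static uint64_t umax(uint64_t a, uint64_t b) { return (a > b ? a : b); }
static uint64_t umin(uint64_t a, uint64_t b) { return (a < b ? a : b); }

int
main(void)
{
	/* Tunables (zfs_dedup_log_* in the patch); demo values. */
	uint64_t flush_txgs = 100;	/* target txgs to drain the backlog */
	uint64_t entries_min = 200;	/* floor on entries flushed per txg */
	uint64_t cap = 100000;		/* soft cap on log size */
	int hard_cap = 0;		/* zfs_dedup_log_hard_cap */

	/* Per-DDT state; invented numbers for the demo. */
	uint64_t backlog = 150000;	/* flushing + active log entries */
	uint64_t prev_backlog = 140000;	/* backlog at the end of last txg */
	uint64_t pressure = 10;		/* ddt_log_flush_pressure, tenths */

	/* Base amount: drain 1/flush_txgs of the backlog, at least the min. */
	uint64_t flush_min = umax(backlog / flush_txgs, entries_min);

	/* Pressure rises while the backlog grows above the cap... */
	if (cap != CAP_UNSET && backlog > cap && backlog > prev_backlog)
		pressure += 10 * backlog / cap;
	/* ...and decays (never below 10 tenths) once below the cap. */
	else if (cap != CAP_UNSET && backlog < cap)
		pressure = umax(10,
		    pressure - (11 - ((10 * backlog + cap - 1) / cap)));

	/* Hard cap: pressure raises the minimum too. */
	if (hard_cap && cap != CAP_UNSET)
		flush_min = umax(flush_min,
		    umin(backlog - cap, flush_min * pressure / 10));

	/* Soft cap raises only the maximum; otherwise max = min + 25%. */
	uint64_t flush_max;
	if (cap != CAP_UNSET && !hard_cap)
		flush_max = umax(flush_min * 5 / 4,
		    umin(backlog - cap, flush_min * pressure / 10));
	else
		flush_max = flush_min * 5 / 4;

	printf("flush_min=%llu flush_max=%llu pressure=%llu\n",
	    (unsigned long long)flush_min,
	    (unsigned long long)flush_max,
	    (unsigned long long)pressure);
	return (0);
}

With these inputs (a backlog 50% over the cap and still growing), pressure
climbs from 10 to 25 tenths and flush_max rises from the baseline 1875 to
3750 entries while flush_min stays at 1500, which is the intended soft-cap
behavior: the pool catches up faster without making every txg sync longer.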