diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 56e972a8d48b..1e59d1eb37a2 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -3349,6 +3349,21 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			that this also can be controlled per-workqueue for
 			workqueues visible under /sys/bus/workqueue/.
 
+	workqueue.power_efficient
+			Per-cpu workqueues are generally preferred because
+			they show better performance thanks to cache
+			locality; unfortunately, per-cpu workqueues tend to
+			be more power hungry than unbound workqueues.
+
+			Enabling this makes the per-cpu workqueues which
+			were observed to contribute significantly to power
+			consumption unbound, leading to measurably lower
+			power usage at the cost of a small performance
+			overhead.
+
+			The default value of this parameter is determined
+			by the config option CONFIG_WQ_POWER_EFFICIENT_DEFAULT.
+
 	x2apic_phys	[X86-64,APIC] Use x2apic physical mode instead of
 			default x2apic cluster mode on platforms
 			supporting x2apic.
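As a quick illustration of what this parameter affects (a hedged sketch, not part of the patch; "demo_work" and "demo_func" are made-up names): a work item queued on system_power_efficient_wq behaves exactly like schedule_work() by default, but runs on an unbound workqueue, with the scheduler free to pick a non-idle CPU, when booting with workqueue.power_efficient=1 or with CONFIG_WQ_POWER_EFFICIENT_DEFAULT=y.

#include <linux/module.h>
#include <linux/smp.h>
#include <linux/workqueue.h>

static void demo_func(struct work_struct *work)
{
	/* raw_ variant: work items run preemptible, the CPU is advisory */
	pr_info("power-efficient work ran on CPU %d\n",
		raw_smp_processor_id());
}

static DECLARE_WORK(demo_work, demo_func);

static int __init demo_init(void)
{
	/* identical to schedule_work() unless power_efficient is enabled */
	queue_work(system_power_efficient_wq, &demo_work);
	return 0;
}

static void __exit demo_exit(void)
{
	flush_work(&demo_work);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

The driver conversions that follow all apply this same one-line substitution.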
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 9c4bb8266bc8..4464c823cff2 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -144,7 +144,8 @@ void put_io_context(struct io_context *ioc)
 	if (atomic_long_dec_and_test(&ioc->refcount)) {
 		spin_lock_irqsave(&ioc->lock, flags);
 		if (!hlist_empty(&ioc->icq_list))
-			schedule_work(&ioc->release_work);
+			queue_work(system_power_efficient_wq,
+				   &ioc->release_work);
 		else
 			free_ioc = true;
 		spin_unlock_irqrestore(&ioc->lock, flags);
diff --git a/block/genhd.c b/block/genhd.c
index 6f612a747810..6acda145311f 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1506,9 +1506,11 @@ static void __disk_unblock_events(struct gendisk *disk, bool check_now)
 	intv = disk_events_poll_jiffies(disk);
 	set_timer_slack(&ev->dwork.timer, intv / 4);
 	if (check_now)
-		queue_delayed_work(system_freezable_wq, &ev->dwork, 0);
+		queue_delayed_work(system_freezable_power_efficient_wq,
+				   &ev->dwork, 0);
 	else if (intv)
-		queue_delayed_work(system_freezable_wq, &ev->dwork, intv);
+		queue_delayed_work(system_freezable_power_efficient_wq,
+				   &ev->dwork, intv);
 out_unlock:
 	spin_unlock_irqrestore(&ev->lock, flags);
 }
@@ -1551,7 +1553,8 @@ void disk_flush_events(struct gendisk *disk, unsigned int mask)
 	spin_lock_irq(&ev->lock);
 	ev->clearing |= mask;
 	if (!ev->block)
-		mod_delayed_work(system_freezable_wq, &ev->dwork, 0);
+		mod_delayed_work(system_freezable_power_efficient_wq,
+				 &ev->dwork, 0);
 	spin_unlock_irq(&ev->lock);
 }
 
@@ -1644,7 +1647,8 @@ static void disk_check_events(struct disk_events *ev,
 	intv = disk_events_poll_jiffies(disk);
 
 	if (!ev->block && intv)
-		queue_delayed_work(system_freezable_wq, &ev->dwork, intv);
+		queue_delayed_work(system_freezable_power_efficient_wq,
+				   &ev->dwork, intv);
 
 	spin_unlock_irq(&ev->lock);
diff --git a/drivers/base/firmware_class.c b/drivers/base/firmware_class.c
index f0493028fc75..cc0c3b1491d9 100644
--- a/drivers/base/firmware_class.c
+++ b/drivers/base/firmware_class.c
@@ -1006,7 +1006,8 @@ static int _request_firmware_load(struct firmware_priv *fw_priv, bool uevent,
 		dev_set_uevent_suppress(f_dev, false);
 		dev_dbg(f_dev, "firmware: requesting %s\n", buf->fw_id);
 		if (timeout != MAX_SCHEDULE_TIMEOUT)
-			schedule_delayed_work(&fw_priv->timeout_work, timeout);
+			queue_delayed_work(system_power_efficient_wq,
+					   &fw_priv->timeout_work, timeout);
 
 		kobject_uevent(&fw_priv->dev.kobj, KOBJ_ADD);
 	}
@@ -1699,8 +1700,8 @@ static void device_uncache_fw_images_work(struct work_struct *work)
  */
 static void device_uncache_fw_images_delay(unsigned long delay)
 {
-	schedule_delayed_work(&fw_cache.work,
-			      msecs_to_jiffies(delay));
+	queue_delayed_work(system_power_efficient_wq, &fw_cache.work,
+			   msecs_to_jiffies(delay));
 }
 
 static int fw_pm_notify(struct notifier_block *notify_block,
diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 38f0b312ff85..663d2d0448b7 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -439,7 +439,7 @@ void phy_start_machine(struct phy_device *phydev,
 	phydev->adjust_state = handler;
 
-	schedule_delayed_work(&phydev->state_queue, HZ);
+	queue_delayed_work(system_power_efficient_wq, &phydev->state_queue, HZ);
 }
 
 /**
@@ -500,7 +500,7 @@ static irqreturn_t phy_interrupt(int irq, void *phy_dat)
 	disable_irq_nosync(irq);
 	atomic_inc(&phydev->irq_disable);
 
-	schedule_work(&phydev->phy_queue);
+	queue_work(system_power_efficient_wq, &phydev->phy_queue);
 
 	return IRQ_HANDLED;
 }
@@ -655,7 +655,7 @@ static void phy_change(struct work_struct *work)
 
 	/* reschedule state queue work to run as soon as possible */
 	cancel_delayed_work_sync(&phydev->state_queue);
-	schedule_delayed_work(&phydev->state_queue, 0);
+	queue_delayed_work(system_power_efficient_wq, &phydev->state_queue, 0);
 
 	return;
 
@@ -918,7 +918,8 @@ void phy_state_machine(struct work_struct *work)
 	if (err < 0)
 		phy_error(phydev);
 
-	schedule_delayed_work(&phydev->state_queue, PHY_STATE_TIME * HZ);
+	queue_delayed_work(system_power_efficient_wq, &phydev->state_queue,
+			   PHY_STATE_TIME * HZ);
 }
 
 static inline void mmd_phy_indirect(struct mii_bus *bus, int prtad, int devad,
diff --git a/drivers/power/smb135x-charger.c b/drivers/power/smb135x-charger.c
index 4048c0a4b3a1..226e3d5756c5 100644
--- a/drivers/power/smb135x-charger.c
+++ b/drivers/power/smb135x-charger.c
@@ -1911,7 +1911,8 @@ static int smb135x_battery_set_property(struct power_supply *psy,
 		smb_stay_awake(&chip->smb_wake_source);
 		chip->bms_check = 1;
 		cancel_delayed_work(&chip->heartbeat_work);
-		schedule_delayed_work(&chip->heartbeat_work,
+		queue_delayed_work(system_power_efficient_wq,
+				   &chip->heartbeat_work,
 				   msecs_to_jiffies(0));
 		break;
 	case POWER_SUPPLY_PROP_HEALTH:
@@ -1921,7 +1922,8 @@ static int smb135x_battery_set_property(struct power_supply *psy,
 		smb135x_set_chrg_path_temp(chip);
 		chip->temp_check = 1;
 		cancel_delayed_work(&chip->heartbeat_work);
-		schedule_delayed_work(&chip->heartbeat_work,
+		queue_delayed_work(system_power_efficient_wq,
+				   &chip->heartbeat_work,
 				   msecs_to_jiffies(0));
 		break;
 	/* Block from Fuel Gauge */
@@ -2598,7 +2600,8 @@ static void aicl_check_work(struct work_struct *work)
 		chip->aicl_weak_detect = true;
 
 	cancel_delayed_work(&chip->src_removal_work);
-	schedule_delayed_work(&chip->src_removal_work,
+	queue_delayed_work(system_power_efficient_wq,
+			   &chip->src_removal_work,
 			   msecs_to_jiffies(3000));
 	if (!rc) {
 		dev_dbg(chip->dev, "Reached Bottom IC!\n");
@@ -2678,8 +2681,9 @@ static void rate_check_work(struct work_struct *work)
 	chip->rate_check_count++;
 	if (chip->rate_check_count < 6)
-		schedule_delayed_work(&chip->rate_check_work,
-				      msecs_to_jiffies(500));
+		queue_delayed_work(system_power_efficient_wq,
+				   &chip->rate_check_work,
+				   msecs_to_jiffies(500));
 }
 
 static void usb_insertion_work(struct work_struct *work)
@@ -2734,8 +2738,9 @@ static void heartbeat_work(struct work_struct *work)
 	    smb135x_get_prop_batt_health(chip, &batt_health)) {
 		dev_warn(chip->dev, "HB Failed to run resume = %d!\n",
			 (int)chip->resume_completed);
-		schedule_delayed_work(&chip->heartbeat_work,
-				      msecs_to_jiffies(1000));
+		queue_delayed_work(system_power_efficient_wq,
+				   &chip->heartbeat_work,
+				   msecs_to_jiffies(1000));
 		return;
 	}
@@ -2833,8 +2838,9 @@ static void heartbeat_work(struct work_struct *work)
 
 	power_supply_changed(&chip->batt_psy);
 
-	schedule_delayed_work(&chip->heartbeat_work,
-			      msecs_to_jiffies(60000));
+	queue_delayed_work(system_power_efficient_wq,
+			   &chip->heartbeat_work,
+			   msecs_to_jiffies(60000));
 	chip->hb_running = false;
 	if (!usb_present && !dc_present)
 		smb_relax(&chip->smb_wake_source);
@@ -2945,7 +2951,8 @@ static int otg_oc_handler(struct smb135x_chg *chip, u8 rt_stat)
 		return 0;
 	}
 
-	schedule_delayed_work(&chip->ocp_clear_work,
+	queue_delayed_work(system_power_efficient_wq,
+			   &chip->ocp_clear_work,
 			   msecs_to_jiffies(0));
 
 	pr_err("rt_stat = 0x%02x\n", rt_stat);
@@ -2970,7 +2977,8 @@ static int handle_dc_removal(struct smb135x_chg *chip)
 static int handle_dc_insertion(struct smb135x_chg *chip)
 {
 	if (chip->dc_psy_type == POWER_SUPPLY_TYPE_WIRELESS)
-		schedule_delayed_work(&chip->wireless_insertion_work,
+		queue_delayed_work(system_power_efficient_wq,
+				   &chip->wireless_insertion_work,
 				   msecs_to_jiffies(DCIN_UNSUSPEND_DELAY_MS));
 	if (chip->dc_psy_type != -EINVAL)
 		power_supply_set_online(&chip->dc_psy,
@@ -3121,8 +3129,9 @@ static int handle_usb_insertion(struct smb135x_chg *chip)
 		smb_stay_awake(&chip->smb_wake_source);
 		chip->apsd_rerun_cnt++;
 		chip->usb_present = 0;
-		schedule_delayed_work(&chip->usb_insertion_work,
-				      msecs_to_jiffies(1000));
+		queue_delayed_work(system_power_efficient_wq,
+				   &chip->usb_insertion_work,
+				   msecs_to_jiffies(1000));
 		return 0;
 	}
@@ -3162,8 +3171,9 @@ static int handle_usb_insertion(struct smb135x_chg *chip)
 	chip->charger_rate = POWER_SUPPLY_CHARGE_RATE_NORMAL;
 	chip->rate_check_count = 0;
 	cancel_delayed_work(&chip->rate_check_work);
-	schedule_delayed_work(&chip->rate_check_work,
-			      msecs_to_jiffies(500));
+	queue_delayed_work(system_power_efficient_wq,
+			   &chip->rate_check_work,
+			   msecs_to_jiffies(500));
 
 	return 0;
 }
@@ -3204,8 +3214,9 @@ static int usbin_uv_handler(struct smb135x_chg *chip, u8 rt_stat)
 		if (rc < 0)
 			pr_err("Failed to Disable USBIN UV IRQ\n");
 		cancel_delayed_work(&chip->aicl_check_work);
-		schedule_delayed_work(&chip->aicl_check_work,
-				      msecs_to_jiffies(0));
+		queue_delayed_work(system_power_efficient_wq,
+				   &chip->aicl_check_work,
+				   msecs_to_jiffies(0));
 	}
 	return 0;
 }
@@ -5435,8 +5446,9 @@ static int smb135x_charger_probe(struct i2c_client *client,
 	if (rc < 0)
 		pr_err("failed to set up voltage notifications: %d\n", rc);
 
-	schedule_delayed_work(&chip->heartbeat_work,
-			      msecs_to_jiffies(60000));
+	queue_delayed_work(system_power_efficient_wq,
+			   &chip->heartbeat_work,
+			   msecs_to_jiffies(60000));
 
 	dev_info(chip->dev, "SMB135X version = %s revision = %s successfully probed batt=%d dc = %d usb = %d\n",
		 version_str[chip->version],
diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index bdc00a196659..7423b8335622 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -1925,8 +1925,9 @@ int regulator_disable_deferred(struct regulator *regulator, int ms)
 	rdev->deferred_disables++;
 	mutex_unlock(&rdev->mutex);
 
-	ret = schedule_delayed_work(&rdev->disable_work,
-				    msecs_to_jiffies(ms));
+	ret = queue_delayed_work(system_power_efficient_wq,
+				 &rdev->disable_work,
+				 msecs_to_jiffies(ms));
 	if (ret < 0)
 		return ret;
 	else
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index 96c82a98cd50..b05d7fb135ac 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -506,7 +506,8 @@ static void led_work (struct work_struct *work)
 			changed++;
 	}
 	if (changed)
-		schedule_delayed_work(&hub->leds, LED_CYCLE_PERIOD);
+		queue_delayed_work(system_power_efficient_wq,
+				   &hub->leds, LED_CYCLE_PERIOD);
 }
 
 /* use a short timeout for hub/port status fetches */
@@ -1057,7 +1058,8 @@ static void hub_activate(struct usb_hub *hub, enum hub_activation_type type)
 			goto init2;
 #endif
 		PREPARE_DELAYED_WORK(&hub->init_work, hub_init_func2);
-		schedule_delayed_work(&hub->init_work,
+		queue_delayed_work(system_power_efficient_wq,
+				   &hub->init_work,
 				   msecs_to_jiffies(delay));
 
 		/* Suppress autosuspend until init is done */
@@ -1218,7 +1220,8 @@ static void hub_activate(struct usb_hub *hub, enum hub_activation_type type)
 		/* Don't do a long sleep inside a workqueue routine */
 		if (type == HUB_INIT2) {
 			PREPARE_DELAYED_WORK(&hub->init_work, hub_init_func3);
-			schedule_delayed_work(&hub->init_work,
+			queue_delayed_work(system_power_efficient_wq,
+					   &hub->init_work,
 					   msecs_to_jiffies(delay));
 			device_unlock(hub->intfdev);
 			return;		/* Continues at init3: below */
@@ -1233,7 +1236,8 @@ static void hub_activate(struct usb_hub *hub, enum hub_activation_type type)
 	if (status < 0)
 		dev_err(hub->intfdev, "activate --> %d\n", status);
 	if (hub->has_indicators && blinkenlights)
-		schedule_delayed_work(&hub->leds, LED_CYCLE_PERIOD);
+		queue_delayed_work(system_power_efficient_wq,
+				   &hub->leds, LED_CYCLE_PERIOD);
 
 	/* Scan all ports that need attention */
 	kick_khubd(hub);
@@ -4396,7 +4400,8 @@ check_highspeed (struct usb_hub *hub, struct usb_device *udev, int port1)
 		/* hub LEDs are probably harder to miss than syslog */
 		if (hub->has_indicators) {
 			hub->indicator[port1-1] = INDICATOR_GREEN_BLINK;
-			schedule_delayed_work (&hub->leds, 0);
+			queue_delayed_work(system_power_efficient_wq,
+					   &hub->leds, 0);
 		}
 	}
 	kfree(qual);
@@ -4626,7 +4631,9 @@ static void hub_port_connect_change(struct usb_hub *hub, int port1,
 				if (hub->has_indicators) {
 					hub->indicator[port1-1] =
						INDICATOR_AMBER_BLINK;
-					schedule_delayed_work (&hub->leds, 0);
+					queue_delayed_work(
+						system_power_efficient_wq,
+						&hub->leds, 0);
 				}
 				status = -ENOTCONN;	/* Don't retry */
 				goto loop_disable;
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 623488fdc1f5..33cbe000424b 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -71,7 +71,8 @@ enum {
 	/* data contains off-queue information when !WORK_STRUCT_PWQ */
 	WORK_OFFQ_FLAG_BASE	= WORK_STRUCT_COLOR_SHIFT,
 
-	WORK_OFFQ_CANCELING	= (1 << WORK_OFFQ_FLAG_BASE),
+	__WORK_OFFQ_CANCELING	= WORK_OFFQ_FLAG_BASE,
+	WORK_OFFQ_CANCELING	= (1 << __WORK_OFFQ_CANCELING),
 
 	/*
 	 * When a work item is off queue, its high bits point to the last
@@ -303,6 +304,33 @@ enum {
 	WQ_CPU_INTENSIVE	= 1 << 5, /* cpu intensive workqueue */
 	WQ_SYSFS		= 1 << 6, /* visible in sysfs, see wq_sysfs_register() */
 
+	/*
+	 * Per-cpu workqueues are generally preferred because they tend to
+	 * show better performance thanks to cache locality.  Per-cpu
+	 * workqueues exclude the scheduler from choosing the CPU to
+	 * execute the worker threads, which has an unfortunate side effect
+	 * of increasing power consumption.
+	 *
+	 * The scheduler considers a CPU idle if it doesn't have any task
+	 * to execute and tries to keep idle cores idle to conserve power;
+	 * however, for example, a per-cpu work item scheduled from an
+	 * interrupt handler on an idle CPU will force the scheduler to
+	 * execute the work item on that CPU breaking the idleness, which in
+	 * turn may lead to more scheduling choices which are sub-optimal
+	 * in terms of power consumption.
+	 *
+	 * Workqueues marked with WQ_POWER_EFFICIENT are per-cpu by default
+	 * but become unbound if the workqueue.power_efficient kernel param
+	 * is specified.  Per-cpu workqueues which were observed to
+	 * contribute significantly to power consumption are marked with
+	 * this flag; enabling the power_efficient mode then leads to
+	 * noticeable power saving at the cost of a small performance
+	 * disadvantage.
+	 *
+	 * http://thread.gmane.org/gmane.linux.kernel/1480396
+	 */
+	WQ_POWER_EFFICIENT	= 1 << 7,
+
 	__WQ_DRAINING		= 1 << 16, /* internal: workqueue is draining */
 	__WQ_ORDERED		= 1 << 17, /* internal: workqueue is ordered */
 
@@ -333,11 +361,19 @@ enum {
 *
 * system_freezable_wq is equivalent to system_wq except that it's
 * freezable.
+ *
+ * *_power_efficient_wq are inclined towards saving power and converted
+ * into WQ_UNBOUND variants if 'wq_power_efficient' is enabled; otherwise,
+ * they are the same as their non-power-efficient counterparts - e.g.
+ * system_power_efficient_wq is identical to system_wq if
+ * 'wq_power_efficient' is disabled.  See WQ_POWER_EFFICIENT for more info.
 */
extern struct workqueue_struct *system_wq;
extern struct workqueue_struct *system_long_wq;
extern struct workqueue_struct *system_unbound_wq;
extern struct workqueue_struct *system_freezable_wq;
+extern struct workqueue_struct *system_power_efficient_wq;
+extern struct workqueue_struct *system_freezable_power_efficient_wq;
 
static inline struct workqueue_struct * __deprecated __system_nrt_wq(void)
{
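For drivers that own a workqueue rather than borrowing the system ones, the opt-in point is the new flag itself. A minimal sketch, not part of the patch (the "poller" name and setup/teardown helpers are hypothetical): the workqueue stays per-cpu by default and is promoted to WQ_UNBOUND by __alloc_workqueue_key() only when workqueue.power_efficient is enabled.

#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *poll_wq;

static int poller_setup(void)
{
	/* per-cpu by default; unbound when power_efficient is enabled */
	poll_wq = alloc_workqueue("poller", WQ_POWER_EFFICIENT, 0);
	if (!poll_wq)
		return -ENOMEM;
	return 0;
}

static void poller_teardown(void)
{
	destroy_workqueue(poll_wq);
}

Because the flag is a no-op unless the knob is set, tagging a workqueue this way carries no behavior change for configurations that keep the default.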
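The Kconfig default only matters for workqueues that opt in. Code that must both stop across suspend and avoid waking idle CPUs can combine the two properties through the new system_freezable_power_efficient_wq, mirroring the block/genhd.c conversion above. A hedged sketch with hypothetical names ("poll_fn", "poll_dwork"):

#include <linux/jiffies.h>
#include <linux/timer.h>
#include <linux/workqueue.h>

static void poll_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(poll_dwork, poll_fn);

static void poll_fn(struct work_struct *work)
{
	/* ... poll the device ... */

	/* self-rearm; frozen during suspend, unbound if power_efficient */
	queue_delayed_work(system_freezable_power_efficient_wq,
			   &poll_dwork, round_jiffies_relative(HZ));
}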
+
 config PM_GENERIC_DOMAINS_SLEEP
 	def_bool y
 	depends on PM_SLEEP && PM_GENERIC_DOMAINS
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 01d5ccb8bfe3..27b654950cc8 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -375,7 +375,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
 	rcu_batch_queue(&sp->batch_queue, head);
 	if (!sp->running) {
 		sp->running = true;
-		schedule_delayed_work(&sp->work, 0);
+		queue_delayed_work(system_power_efficient_wq, &sp->work, 0);
 	}
 	spin_unlock_irqrestore(&sp->queue_lock, flags);
 }
@@ -631,7 +631,8 @@ static void srcu_reschedule(struct srcu_struct *sp)
 	}
 
 	if (pending)
-		schedule_delayed_work(&sp->work, SRCU_INTERVAL);
+		queue_delayed_work(system_power_efficient_wq,
+				   &sp->work, SRCU_INTERVAL);
 }
 
 /*
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index af8d1d4f3d55..419a52cecd20 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -514,12 +514,13 @@ static void sync_cmos_clock(struct work_struct *work)
 		next.tv_sec++;
 		next.tv_nsec -= NSEC_PER_SEC;
 	}
-	schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next));
+	queue_delayed_work(system_power_efficient_wq,
+			   &sync_cmos_work, timespec_to_jiffies(&next));
 }
 
 void ntp_notify_cmos_timer(void)
 {
-	schedule_delayed_work(&sync_cmos_work, 0);
+	queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0);
 }
 
 #else
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4611cb80e1a4..70a526972c61 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -273,6 +273,15 @@ static cpumask_var_t *wq_numa_possible_cpumask;
 static bool wq_disable_numa;
 module_param_named(disable_numa, wq_disable_numa, bool, 0444);
 
+/* see the comment above the definition of WQ_POWER_EFFICIENT */
+#ifdef CONFIG_WQ_POWER_EFFICIENT_DEFAULT
+static bool wq_power_efficient = true;
+#else
+static bool wq_power_efficient;
+#endif
+
+module_param_named(power_efficient, wq_power_efficient, bool, 0444);
+
 static bool wq_numa_enabled;		/* unbound NUMA affinity enabled */
 
 /* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
@@ -309,6 +318,10 @@ struct workqueue_struct *system_unbound_wq __read_mostly;
 EXPORT_SYMBOL_GPL(system_unbound_wq);
 struct workqueue_struct *system_freezable_wq __read_mostly;
 EXPORT_SYMBOL_GPL(system_freezable_wq);
+struct workqueue_struct *system_power_efficient_wq __read_mostly;
+EXPORT_SYMBOL_GPL(system_power_efficient_wq);
+struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
+EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
 
 static int worker_thread(void *__worker);
 static void copy_workqueue_attrs(struct workqueue_attrs *to,
@@ -1451,13 +1464,13 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
 	timer_stats_timer_set_start_info(&dwork->timer);
 
 	dwork->wq = wq;
+	/* timer isn't guaranteed to run on this cpu, record it earlier */
+	if (cpu == WORK_CPU_UNBOUND)
+		cpu = raw_smp_processor_id();
 	dwork->cpu = cpu;
 	timer->expires = jiffies + delay;
 
-	if (unlikely(cpu != WORK_CPU_UNBOUND))
-		add_timer_on(timer, cpu);
-	else
-		add_timer(timer);
+	add_timer_on(timer, cpu);
 }
 
 /**
@@ -1929,17 +1942,13 @@ static void pool_mayday_timeout(unsigned long __pool)
 * spin_lock_irq(pool->lock) which may be released and regrabbed
 * multiple times.  Does GFP_KERNEL allocations.  Called only from
 * manager.
- *
- * RETURNS:
- * %false if no action was taken and pool->lock stayed locked, %true
- * otherwise.
 */
-static bool maybe_create_worker(struct worker_pool *pool)
+static void maybe_create_worker(struct worker_pool *pool)
__releases(&pool->lock)
__acquires(&pool->lock)
{
 	if (!need_to_create_worker(pool))
-		return false;
+		return;
restart:
 	spin_unlock_irq(&pool->lock);
 
@@ -1956,7 +1965,7 @@ __acquires(&pool->lock)
 		start_worker(worker);
 		if (WARN_ON_ONCE(need_to_create_worker(pool)))
 			goto restart;
-		return true;
+		return;
 	}
 
 	if (!need_to_create_worker(pool))
@@ -1973,7 +1982,7 @@ __acquires(&pool->lock)
 	spin_lock_irq(&pool->lock);
 	if (need_to_create_worker(pool))
 		goto restart;
-	return true;
+	return;
}
 
/**
@@ -1986,15 +1995,9 @@ __acquires(&pool->lock)
 * LOCKING:
 * spin_lock_irq(pool->lock) which may be released and regrabbed
 * multiple times.  Called only from manager.
- *
- * RETURNS:
- * %false if no action was taken and pool->lock stayed locked, %true
- * otherwise.
 */
-static bool maybe_destroy_workers(struct worker_pool *pool)
+static void maybe_destroy_workers(struct worker_pool *pool)
{
-	bool ret = false;
-
 	while (too_many_workers(pool)) {
 		struct worker *worker;
 		unsigned long expires;
@@ -2008,10 +2011,7 @@ static bool maybe_destroy_workers(struct worker_pool *pool)
 		}
 
 		destroy_worker(worker);
-		ret = true;
 	}
-
-	return ret;
}
 
/**
@@ -2031,13 +2031,14 @@ static bool maybe_destroy_workers(struct worker_pool *pool)
 * multiple times.  Does GFP_KERNEL allocations.
 *
 * RETURNS:
- * spin_lock_irq(pool->lock) which may be released and regrabbed
- * multiple times.  Does GFP_KERNEL allocations.
+ * %false if the pool doesn't need management and the caller can safely
+ * start processing works, %true if management functions were performed
+ * and the conditions that the caller verified before calling the
+ * function may no longer be true.
 */
static bool manage_workers(struct worker *worker)
{
 	struct worker_pool *pool = worker->pool;
-	bool ret = false;
 
 	/*
 	 * Managership is governed by two mutexes - manager_arb and
@@ -2061,7 +2062,7 @@ static bool manage_workers(struct worker *worker)
 	 * manager_mutex.
 	 */
 	if (!mutex_trylock(&pool->manager_arb))
-		return ret;
+		return false;
 
 	/*
 	 * With manager arbitration won, manager_mutex would be free in
@@ -2071,7 +2072,6 @@ static bool manage_workers(struct worker *worker)
 		spin_unlock_irq(&pool->lock);
 		mutex_lock(&pool->manager_mutex);
 		spin_lock_irq(&pool->lock);
-		ret = true;
 	}
 
 	pool->flags &= ~POOL_MANAGE_WORKERS;
@@ -2080,12 +2080,12 @@ static bool manage_workers(struct worker *worker)
 	 * Destroy and then create so that may_start_working() is true
 	 * on return.
 	 */
-	ret |= maybe_destroy_workers(pool);
-	ret |= maybe_create_worker(pool);
+	maybe_destroy_workers(pool);
+	maybe_create_worker(pool);
 
 	mutex_unlock(&pool->manager_mutex);
 	mutex_unlock(&pool->manager_arb);
-	return ret;
+	return true;
}
 
/**
@@ -2853,19 +2853,57 @@ bool flush_work(struct work_struct *work)
}
EXPORT_SYMBOL_GPL(flush_work);
 
+struct cwt_wait {
+	wait_queue_t		wait;
+	struct work_struct	*work;
+};
+
+static int cwt_wakefn(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+	struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait);
+
+	if (cwait->work != key)
+		return 0;
+	return autoremove_wake_function(wait, mode, sync, key);
+}
+
static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
{
+	static DECLARE_WAIT_QUEUE_HEAD(cancel_waitq);
 	unsigned long flags;
 	int ret;
 
 	do {
 		ret = try_to_grab_pending(work, is_dwork, &flags);
 		/*
-		 * If someone else is canceling, wait for the same event it
-		 * would be waiting for before retrying.
+		 * If someone else is already canceling, wait for it to
+		 * finish.  flush_work() doesn't work for PREEMPT_NONE
+		 * because we may get scheduled between @work's completion
+		 * and the other canceling task resuming and clearing
+		 * CANCELING - flush_work() will return false immediately
+		 * as @work is no longer busy, try_to_grab_pending() will
+		 * return -ENOENT as @work is still being canceled and the
+		 * other canceling task won't be able to clear CANCELING as
+		 * we're hogging the CPU.
+		 *
+		 * Let's wait for completion using a waitqueue.  As this
+		 * may lead to the thundering herd problem, use a custom
+		 * wake function which matches @work along with exclusive
+		 * wait and wakeup.
 		 */
-		if (unlikely(ret == -ENOENT))
-			flush_work(work);
+		if (unlikely(ret == -ENOENT)) {
+			struct cwt_wait cwait;
+
+			init_wait(&cwait.wait);
+			cwait.wait.func = cwt_wakefn;
+			cwait.work = work;
+
+			prepare_to_wait_exclusive(&cancel_waitq, &cwait.wait,
+						  TASK_UNINTERRUPTIBLE);
+			if (work_is_canceling(work))
+				schedule();
+			finish_wait(&cancel_waitq, &cwait.wait);
+		}
 	} while (unlikely(ret < 0));
 
 	/* tell other tasks trying to grab @work to back off */
@@ -2874,6 +2912,16 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork)
 
 	flush_work(work);
 	clear_work_data(work);
+
+	/*
+	 * Paired with prepare_to_wait() above so that either
+	 * waitqueue_active() is visible here or !work_is_canceling() is
+	 * visible there.
+	 */
+	smp_mb();
+	if (waitqueue_active(&cancel_waitq))
+		__wake_up(&cancel_waitq, TASK_NORMAL, 1, work);
+
 	return ret;
}
 
@@ -4126,6 +4174,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
 	struct workqueue_struct *wq;
 	struct pool_workqueue *pwq;
 
+	/* see the comment above the definition of WQ_POWER_EFFICIENT */
+	if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
+		flags |= WQ_UNBOUND;
+
 	/* allocate wq and format name */
 	if (flags & WQ_UNBOUND)
 		tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]);
@@ -4569,10 +4621,13 @@ static void wq_unbind_fn(struct work_struct *work)
/**
 * rebind_workers - rebind all workers of a pool to the associated CPU
 * @pool: pool of interest
+ * @force: if %true, replace WORKER_UNBOUND with WORKER_REBOUND on all
+ * workers irrespective of their current flags; otherwise, do so only
+ * for workers that carry WORKER_UNBOUND
 *
 * @pool->cpu is coming online.  Rebind all workers to the CPU.
 */
-static void rebind_workers(struct worker_pool *pool)
+static void rebind_workers(struct worker_pool *pool, bool force)
{
 	struct worker *worker;
 	int wi;
@@ -4591,6 +4646,7 @@ static void rebind_workers(struct worker_pool *pool)
 						  pool->attrs->cpumask) < 0);
 
 	spin_lock_irq(&pool->lock);
+	pool->flags &= ~POOL_DISASSOCIATED;
 
 	for_each_pool_worker(worker, wi, pool) {
 		unsigned int worker_flags = worker->flags;
@@ -4621,10 +4677,12 @@ static void rebind_workers(struct worker_pool *pool)
 		 * fail incorrectly leading to premature concurrency
 		 * management operations.
 		 */
-		WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND));
-		worker_flags |= WORKER_REBOUND;
-		worker_flags &= ~WORKER_UNBOUND;
-		ACCESS_ONCE(worker->flags) = worker_flags;
+		if (force || (worker_flags & WORKER_UNBOUND)) {
+			WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND));
+			worker_flags |= WORKER_REBOUND;
+			worker_flags &= ~WORKER_UNBOUND;
+			ACCESS_ONCE(worker->flags) = worker_flags;
+		}
 	}
 
 	spin_unlock_irq(&pool->lock);
@@ -4693,15 +4751,12 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
 		for_each_pool(pool, pi) {
 			mutex_lock(&pool->manager_mutex);
 
-			if (pool->cpu == cpu) {
-				spin_lock_irq(&pool->lock);
-				pool->flags &= ~POOL_DISASSOCIATED;
-				spin_unlock_irq(&pool->lock);
-
-				rebind_workers(pool);
-			} else if (pool->cpu < 0) {
+			if (pool->cpu == cpu)
+				rebind_workers(pool,
+					       (action & ~CPU_TASKS_FROZEN)
+					       != CPU_DOWN_FAILED);
+			else if (pool->cpu < 0)
 				restore_unbound_workers_cpumask(pool, cpu);
-			}
 
 			mutex_unlock(&pool->manager_mutex);
 		}
@@ -5035,8 +5090,15 @@ static int __init init_workqueues(void)
					    WQ_UNBOUND_MAX_ACTIVE);
 	system_freezable_wq = alloc_workqueue("events_freezable",
					      WQ_FREEZABLE, 0);
+	system_power_efficient_wq = alloc_workqueue("events_power_efficient",
+					      WQ_POWER_EFFICIENT, 0);
+	system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient",
+					      WQ_FREEZABLE | WQ_POWER_EFFICIENT,
+					      0);
 	BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
-	       !system_unbound_wq || !system_freezable_wq);
+	       !system_unbound_wq || !system_freezable_wq ||
+	       !system_power_efficient_wq ||
+	       !system_freezable_power_efficient_wq);
 	return 0;
}
early_initcall(init_workqueues);
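The keyed, exclusive-wait pattern that the __cancel_work_timer() hunk above introduces generalizes beyond work items. A hedged sketch of the same idiom with hypothetical names ("obj_wait", "wait_for_obj"): each waiter filters wake-ups by key, so a wake-up rouses at most one waiter interested in that object and avoids a thundering herd on the shared waitqueue.

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(obj_waitq);

struct obj_wait {
	wait_queue_t	wait;
	void		*obj;
};

static int obj_wakefn(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	struct obj_wait *owait = container_of(wait, struct obj_wait, wait);

	if (owait->obj != key)		/* not the object we're waiting for */
		return 0;
	return autoremove_wake_function(wait, mode, sync, key);
}

static void wait_for_obj(void *obj, bool (*busy)(void *obj))
{
	struct obj_wait owait;

	init_wait(&owait.wait);
	owait.wait.func = obj_wakefn;
	owait.obj = obj;

	prepare_to_wait_exclusive(&obj_waitq, &owait.wait,
				  TASK_UNINTERRUPTIBLE);
	if (busy(obj))			/* re-check after queueing */
		schedule();
	finish_wait(&obj_waitq, &owait.wait);
}

static void release_obj(void *obj)
{
	/* pair with prepare_to_wait_exclusive() in wait_for_obj() */
	smp_mb();
	if (waitqueue_active(&obj_waitq))
		__wake_up(&obj_waitq, TASK_NORMAL, 1, obj);
}

The smp_mb()/waitqueue_active() pairing mirrors the one in the patch: the releaser either sees the waiter on the queue, or the waiter sees the object already idle before sleeping.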
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index cdd77161e99f..7aa5aa1f2fa9 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -826,7 +826,7 @@ static void neigh_periodic_work(struct work_struct *work)
 	 * ARP entry timeouts range from 1/2 base_reachable_time to 3/2
 	 * base_reachable_time.
 	 */
-	schedule_delayed_work(&tbl->gc_work,
+	queue_delayed_work(system_power_efficient_wq, &tbl->gc_work,
 			      tbl->parms.base_reachable_time >> 1);
 	write_unlock_bh(&tbl->lock);
}
@@ -1544,7 +1544,8 @@ static void neigh_table_init_no_netlink(struct neigh_table *tbl)
 	rwlock_init(&tbl->lock);
 	INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work);
-	schedule_delayed_work(&tbl->gc_work, tbl->parms.reachable_time);
+	queue_delayed_work(system_power_efficient_wq, &tbl->gc_work,
+			   tbl->parms.reachable_time);
 	setup_timer(&tbl->proxy_timer, neigh_proxy_process, (unsigned long)tbl);
 	skb_queue_head_init_class(&tbl->proxy_queue,
 				  &neigh_table_proxy_queue_class);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index b151e0ac7f27..5b85c29d7d98 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -469,7 +469,7 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
 	inet_hash_insert(dev_net(in_dev->dev), ifa);
 
 	cancel_delayed_work(&check_lifetime_work);
-	schedule_delayed_work(&check_lifetime_work, 0);
+	queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0);
 
 	/* Send message first, then call notifier.
 	   Notifier will trigger FIB update, so that
@@ -678,7 +678,8 @@ static void check_lifetime(struct work_struct *work)
 	if (time_before(next_sched, now + ADDRCONF_TIMER_FUZZ_MAX))
 		next_sched = now + ADDRCONF_TIMER_FUZZ_MAX;
 
-	schedule_delayed_work(&check_lifetime_work, next_sched - now);
+	queue_delayed_work(system_power_efficient_wq, &check_lifetime_work,
+			   next_sched - now);
}
 
static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft,
@@ -834,7 +835,8 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh)
 		ifa = ifa_existing;
 		set_ifa_lifetime(ifa, valid_lft, prefered_lft);
 		cancel_delayed_work(&check_lifetime_work);
-		schedule_delayed_work(&check_lifetime_work, 0);
+		queue_delayed_work(system_power_efficient_wq,
+				   &check_lifetime_work, 0);
 		rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid);
 		blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);
 	}
@@ -2299,7 +2301,7 @@ void __init devinet_init(void)
 	register_gifconf(PF_INET, inet_gifconf);
 	register_netdevice_notifier(&ip_netdev_notifier);
 
-	schedule_delayed_work(&check_lifetime_work, 0);
+	queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0);
 
 	rtnl_af_register(&inet_af_ops);
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index c099b4fffd93..9e94dcabfee1 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -800,7 +800,8 @@ void rfkill_resume_polling(struct rfkill *rfkill)
 	if (!rfkill->ops->poll)
 		return;
 
-	schedule_work(&rfkill->poll_work.work);
+	queue_delayed_work(system_power_efficient_wq,
+			   &rfkill->poll_work, 0);
}
EXPORT_SYMBOL(rfkill_resume_polling);
@@ -908,7 +909,8 @@ static void rfkill_poll(struct work_struct *work)
 	 */
 	rfkill->ops->poll(rfkill, rfkill->data);
 
-	schedule_delayed_work(&rfkill->poll_work,
+	queue_delayed_work(system_power_efficient_wq,
+			   &rfkill->poll_work,
 			   round_jiffies_relative(POLL_INTERVAL));
}
@@ -972,7 +974,8 @@ int __must_check rfkill_register(struct rfkill *rfkill)
 	INIT_WORK(&rfkill->sync_work, rfkill_sync_work);
 
 	if (rfkill->ops->poll)
-		schedule_delayed_work(&rfkill->poll_work,
+		queue_delayed_work(system_power_efficient_wq,
+				   &rfkill->poll_work,
 				   round_jiffies_relative(POLL_INTERVAL));
 
 	if (!rfkill->persistent || rfkill_epo_lock_active) {
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index a45d07a4b47a..f515d2bde5d0 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -1581,8 +1581,8 @@ static void reg_process_hint(struct regulatory_request *reg_request,
 		break;
 	default:
 		if (reg_initiator == NL80211_REGDOM_SET_BY_USER)
-			schedule_delayed_work(&reg_timeout,
-					      msecs_to_jiffies(3142));
+			queue_delayed_work(system_power_efficient_wq,
+					   &reg_timeout,
+					   msecs_to_jiffies(3142));
 		break;
 	}
}
@@ -2184,7 +2184,8 @@ static int __set_regdom(const struct ieee80211_regdomain *rd)
 	if (!request_wiphy &&
 	    (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER ||
 	     lr->initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE)) {
-		schedule_delayed_work(&reg_timeout, 0);
+		queue_delayed_work(system_power_efficient_wq,
+				   &reg_timeout, 0);
 		return -ENODEV;
 	}
diff --git a/sound/soc/soc-compress.c b/sound/soc/soc-compress.c
index 91a24bf07dad..4e20ea709cc6 100644
--- a/sound/soc/soc-compress.c
+++ b/sound/soc/soc-compress.c
@@ -246,8 +246,9 @@ static int soc_compr_free(struct snd_compr_stream *cstream)
					SND_SOC_DAPM_STREAM_STOP);
 		} else {
 			rtd->pop_wait = 1;
-			schedule_delayed_work(&rtd->delayed_work,
-					      msecs_to_jiffies(rtd->pmdown_time));
+			queue_delayed_work(system_power_efficient_wq,
+					   &rtd->delayed_work,
+					   msecs_to_jiffies(rtd->pmdown_time));
 		}
 	} else {
 		/* capture streams can be powered down now */
diff --git a/sound/soc/soc-jack.c b/sound/soc/soc-jack.c
index 346991e39a14..b7973e494fa3 100644
--- a/sound/soc/soc-jack.c
+++ b/sound/soc/soc-jack.c
@@ -280,7 +280,7 @@ static irqreturn_t gpio_handler(int irq, void *data)
 	if (device_may_wakeup(dev))
 		pm_wakeup_event(dev, gpio->debounce_time + 50);
 
-	schedule_delayed_work(&gpio->work,
+	queue_delayed_work(system_power_efficient_wq, &gpio->work,
 			      msecs_to_jiffies(gpio->debounce_time));
 
 	return IRQ_HANDLED;
diff --git a/sound/soc/soc-pcm.c b/sound/soc/soc-pcm.c
index 42015dbc46b4..1699e62aaf9f 100644
--- a/sound/soc/soc-pcm.c
+++ b/sound/soc/soc-pcm.c
@@ -426,8 +426,9 @@ static int soc_pcm_close(struct snd_pcm_substream *substream)
 	} else {
 		/* start delayed pop wq here for playback streams */
 		rtd->pop_wait = 1;
-		schedule_delayed_work(&rtd->delayed_work,
-				      msecs_to_jiffies(rtd->pmdown_time));
+		queue_delayed_work(system_power_efficient_wq,
+				   &rtd->delayed_work,
+				   msecs_to_jiffies(rtd->pmdown_time));
 	}
 } else {
 	/* capture streams can be powered down now */