From 2da9606aea5a8fd1b710f8c8dd5295da4825e9cd Mon Sep 17 00:00:00 2001 From: Roman Pen Date: Tue, 26 Apr 2016 13:15:35 +0200 Subject: workqueue: fix ghost PENDING flag while doing MQ IO commit 346c09f80459a3ad97df1816d6d606169a51001a upstream. The bug in a workqueue leads to a stalled IO request in MQ ctx->rq_list with the following backtrace: [ 601.347452] INFO: task kworker/u129:5:1636 blocked for more than 120 seconds. [ 601.347574] Tainted: G O 4.4.5-1-storage+ #6 [ 601.347651] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 601.348142] kworker/u129:5 D ffff880803077988 0 1636 2 0x00000000 [ 601.348519] Workqueue: ibnbd_server_fileio_wq ibnbd_dev_file_submit_io_worker [ibnbd_server] [ 601.348999] ffff880803077988 ffff88080466b900 ffff8808033f9c80 ffff880803078000 [ 601.349662] ffff880807c95000 7fffffffffffffff ffffffff815b0920 ffff880803077ad0 [ 601.350333] ffff8808030779a0 ffffffff815b01d5 0000000000000000 ffff880803077a38 [ 601.350965] Call Trace: [ 601.351203] [] ? bit_wait+0x60/0x60 [ 601.351444] [] schedule+0x35/0x80 [ 601.351709] [] schedule_timeout+0x192/0x230 [ 601.351958] [] ? blk_flush_plug_list+0xc7/0x220 [ 601.352208] [] ? ktime_get+0x37/0xa0 [ 601.352446] [] ? bit_wait+0x60/0x60 [ 601.352688] [] io_schedule_timeout+0xa4/0x110 [ 601.352951] [] ? _raw_spin_unlock_irqrestore+0xe/0x10 [ 601.353196] [] bit_wait_io+0x1b/0x70 [ 601.353440] [] __wait_on_bit+0x5d/0x90 [ 601.353689] [] wait_on_page_bit+0xc0/0xd0 [ 601.353958] [] ? autoremove_wake_function+0x40/0x40 [ 601.354200] [] __filemap_fdatawait_range+0xe4/0x140 [ 601.354441] [] filemap_fdatawait_range+0x14/0x30 [ 601.354688] [] filemap_write_and_wait_range+0x3f/0x70 [ 601.354932] [] blkdev_fsync+0x1b/0x50 [ 601.355193] [] vfs_fsync_range+0x49/0xa0 [ 601.355432] [] blkdev_write_iter+0xca/0x100 [ 601.355679] [] __vfs_write+0xaa/0xe0 [ 601.355925] [] vfs_write+0xa9/0x1a0 [ 601.356164] [] kernel_write+0x38/0x50 The underlying device is a null_blk, with default parameters: queue_mode = MQ submit_queues = 1 Verification that nullb0 has something inflight: root@pserver8:~# cat /sys/block/nullb0/inflight 0 1 root@pserver8:~# find /sys/block/nullb0/mq/0/cpu* -name rq_list -print -exec cat {} \; ... /sys/block/nullb0/mq/0/cpu2/rq_list CTX pending: ffff8838038e2400 ... During debug it became clear that stalled request is always inserted in the rq_list from the following path: save_stack_trace_tsk + 34 blk_mq_insert_requests + 231 blk_mq_flush_plug_list + 281 blk_flush_plug_list + 199 wait_on_page_bit + 192 __filemap_fdatawait_range + 228 filemap_fdatawait_range + 20 filemap_write_and_wait_range + 63 blkdev_fsync + 27 vfs_fsync_range + 73 blkdev_write_iter + 202 __vfs_write + 170 vfs_write + 169 kernel_write + 56 So blk_flush_plug_list() was called with from_schedule == true. If from_schedule is true, that means that finally blk_mq_insert_requests() offloads execution of __blk_mq_run_hw_queue() and uses kblockd workqueue, i.e. it calls kblockd_schedule_delayed_work_on(). That means, that we race with another CPU, which is about to execute __blk_mq_run_hw_queue() work. Further debugging shows the following traces from different CPUs: CPU#0 CPU#1 ---------------------------------- ------------------------------- reqeust A inserted STORE hctx->ctx_map[0] bit marked kblockd_schedule...() returns 1 request B inserted STORE hctx->ctx_map[1] bit marked kblockd_schedule...() returns 0 *** WORK PENDING bit is cleared *** flush_busy_ctxs() is executed, but bit 1, set by CPU#1, is not observed As a result request B pended forever. This behaviour can be explained by speculative LOAD of hctx->ctx_map on CPU#0, which is reordered with clear of PENDING bit and executed _before_ actual STORE of bit 1 on CPU#1. The proper fix is an explicit full barrier , which guarantees that clear of PENDING bit is to be executed before all possible speculative LOADS or STORES inside actual work function. Signed-off-by: Roman Pen Cc: Gioh Kim Cc: Michael Wang Cc: Tejun Heo Cc: Jens Axboe Cc: linux-block@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- kernel/workqueue.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 450c21fd0e6e..0ec05948a97b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -649,6 +649,35 @@ static void set_work_pool_and_clear_pending(struct work_struct *work, */ smp_wmb(); set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0); + /* + * The following mb guarantees that previous clear of a PENDING bit + * will not be reordered with any speculative LOADS or STORES from + * work->current_func, which is executed afterwards. This possible + * reordering can lead to a missed execution on attempt to qeueue + * the same @work. E.g. consider this case: + * + * CPU#0 CPU#1 + * ---------------------------- -------------------------------- + * + * 1 STORE event_indicated + * 2 queue_work_on() { + * 3 test_and_set_bit(PENDING) + * 4 } set_..._and_clear_pending() { + * 5 set_work_data() # clear bit + * 6 smp_mb() + * 7 work->current_func() { + * 8 LOAD event_indicated + * } + * + * Without an explicit full barrier speculative LOAD on line 8 can + * be executed before CPU#0 does STORE on line 1. If that happens, + * CPU#0 observes the PENDING bit is still set and new execution of + * a @work is not queued in a hope, that CPU#1 will eventually + * finish the queued @work. Meanwhile CPU#1 does not see + * event_indicated is set, because speculative LOAD was executed + * before actual STORE. + */ + smp_mb(); } static void clear_work_data(struct work_struct *work) -- cgit v1.2.3 From cf73d8ad76e4555a45ee399887b7c0361354d10f Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 11 May 2016 17:55:18 +0800 Subject: workqueue: fix rebind bound workers warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit f7c17d26f43d5cc1b7a6b896cd2fa24a079739b9 upstream. ------------[ cut here ]------------ WARNING: CPU: 0 PID: 16 at kernel/workqueue.c:4559 rebind_workers+0x1c0/0x1d0 Modules linked in: CPU: 0 PID: 16 Comm: cpuhp/0 Not tainted 4.6.0-rc4+ #31 Hardware name: IBM IBM System x3550 M4 Server -[7914IUW]-/00Y8603, BIOS -[D7E128FUS-1.40]- 07/23/2013 0000000000000000 ffff881037babb58 ffffffff8139d885 0000000000000010 0000000000000000 0000000000000000 0000000000000000 ffff881037babba8 ffffffff8108505d ffff881037ba0000 000011cf3e7d6e60 0000000000000046 Call Trace: dump_stack+0x89/0xd4 __warn+0xfd/0x120 warn_slowpath_null+0x1d/0x20 rebind_workers+0x1c0/0x1d0 workqueue_cpu_up_callback+0xf5/0x1d0 notifier_call_chain+0x64/0x90 ? trace_hardirqs_on_caller+0xf2/0x220 ? notify_prepare+0x80/0x80 __raw_notifier_call_chain+0xe/0x10 __cpu_notify+0x35/0x50 notify_down_prepare+0x5e/0x80 ? notify_prepare+0x80/0x80 cpuhp_invoke_callback+0x73/0x330 ? __schedule+0x33e/0x8a0 cpuhp_down_callbacks+0x51/0xc0 cpuhp_thread_fun+0xc1/0xf0 smpboot_thread_fn+0x159/0x2a0 ? smpboot_create_threads+0x80/0x80 kthread+0xef/0x110 ? wait_for_completion+0xf0/0x120 ? schedule_tail+0x35/0xf0 ret_from_fork+0x22/0x50 ? __init_kthread_worker+0x70/0x70 ---[ end trace eb12ae47d2382d8f ]--- notify_down_prepare: attempt to take down CPU 0 failed This bug can be reproduced by below config w/ nohz_full= all cpus: CONFIG_BOOTPARAM_HOTPLUG_CPU0=y CONFIG_DEBUG_HOTPLUG_CPU0=y CONFIG_NO_HZ_FULL=y As Thomas pointed out: | If a down prepare callback fails, then DOWN_FAILED is invoked for all | callbacks which have successfully executed DOWN_PREPARE. | | But, workqueue has actually two notifiers. One which handles | UP/DOWN_FAILED/ONLINE and one which handles DOWN_PREPARE. | | Now look at the priorities of those callbacks: | | CPU_PRI_WORKQUEUE_UP = 5 | CPU_PRI_WORKQUEUE_DOWN = -5 | | So the call order on DOWN_PREPARE is: | | CB 1 | CB ... | CB workqueue_up() -> Ignores DOWN_PREPARE | CB ... | CB X ---> Fails | | So we call up to CB X with DOWN_FAILED | | CB 1 | CB ... | CB workqueue_up() -> Handles DOWN_FAILED | CB ... | CB X-1 | | So the problem is that the workqueue stuff handles DOWN_FAILED in the up | callback, while it should do it in the down callback. Which is not a good idea | either because it wants to be called early on rollback... | | Brilliant stuff, isn't it? The hotplug rework will solve this problem because | the callbacks become symetric, but for the existing mess, we need some | workaround in the workqueue code. The boot CPU handles housekeeping duty(unbound timers, workqueues, timekeeping, ...) on behalf of full dynticks CPUs. It must remain online when nohz full is enabled. There is a priority set to every notifier_blocks: workqueue_cpu_up > tick_nohz_cpu_down > workqueue_cpu_down So tick_nohz_cpu_down callback failed when down prepare cpu 0, and notifier_blocks behind tick_nohz_cpu_down will not be called any more, which leads to workers are actually not unbound. Then hotplug state machine will fallback to undo and online cpu 0 again. Workers will be rebound unconditionally even if they are not unbound and trigger the warning in this progress. This patch fix it by catching !DISASSOCIATED to avoid rebind bound workers. Cc: Tejun Heo Cc: Lai Jiangshan Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Frédéric Weisbecker Suggested-by: Lai Jiangshan Signed-off-by: Wanpeng Li Signed-off-by: Greg Kroah-Hartman --- kernel/workqueue.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0ec05948a97b..2c2f971f3e75 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4457,6 +4457,17 @@ static void rebind_workers(struct worker_pool *pool) pool->attrs->cpumask) < 0); spin_lock_irq(&pool->lock); + + /* + * XXX: CPU hotplug notifiers are weird and can call DOWN_FAILED + * w/o preceding DOWN_PREPARE. Work around it. CPU hotplug is + * being reworked and this can go away in time. + */ + if (!(pool->flags & POOL_DISASSOCIATED)) { + spin_unlock_irq(&pool->lock); + return; + } + pool->flags &= ~POOL_DISASSOCIATED; for_each_pool_worker(worker, pool) { -- cgit v1.2.3