From 8d7dc9283f399e1fda4e48a1c453f689326d9396 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 14 Apr 2015 19:33:59 -0700
Subject: rcu: Control grace-period delays directly from value

In a misguided attempt to avoid an #ifdef, the use of the
gp_init_delay module parameter was conditioned on the corresponding
RCU_TORTURE_TEST_SLOW_INIT Kconfig variable, using IS_ENABLED() at
the point of use in the code.  This meant that the compiler always saw
the delay, which meant that RCU_TORTURE_TEST_SLOW_INIT_DELAY had to be
unconditionally defined.  This in turn caused "make oldconfig" to ask
pointless questions about the value of RCU_TORTURE_TEST_SLOW_INIT_DELAY
in cases where it was not even used.

This commit avoids these pointless questions by defining gp_init_delay
under #ifdef.  In one branch, gp_init_delay is initialized to
RCU_TORTURE_TEST_SLOW_INIT_DELAY and is also a module parameter (thus
allowing boot-time modification), and in the other branch gp_init_delay
is a const variable initialized by default to zero.

This approach also simplifies the code at the delay point by eliminating
the IS_DEFINED().  Because gp_init_delay is constant zero in the no-delay
case intended for production use, the "gp_init_delay > 0" check causes
the delay to become dead code, as desired in this case.  In addition,
this commit replaces magic constant "10" with the preprocessor variable
PER_RCU_NODE_PERIOD, which controls the number of grace periods that
are allowed to elapse at full speed before a delay is inserted.

Reported-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by:
Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 233165da782f..8cf7304b2867 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -162,11 +162,14 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
 static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
 module_param(kthread_prio, int, 0644);
 
-/* Delay in jiffies for grace-period initialization delays. */
-static int gp_init_delay = IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_INIT)
-				? CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY
-				: 0;
+/* Delay in jiffies for grace-period initialization delays, debug only. */
+#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT
+static int gp_init_delay = CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY;
 module_param(gp_init_delay, int, 0644);
+#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */
+static const int gp_init_delay;
+#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */
+#define PER_RCU_NODE_PERIOD 10	/* Number of grace periods between delays. */
 
 /*
  * Track the rcutorture test sequence number and the update version
@@ -1843,9 +1846,8 @@ static int rcu_gp_init(struct rcu_state *rsp)
 		raw_spin_unlock_irq(&rnp->lock);
 		cond_resched_rcu_qs();
 		ACCESS_ONCE(rsp->gp_activity) = jiffies;
-		if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_INIT) &&
-		    gp_init_delay > 0 &&
-		    !(rsp->gpnum % (rcu_num_nodes * 10)))
+		if (gp_init_delay > 0 &&
+		    !(rsp->gpnum % (rcu_num_nodes * PER_RCU_NODE_PERIOD)))
 			schedule_timeout_uninterruptible(gp_init_delay);
 	}
 
-- 
cgit v1.2.3


From 7e01b5acd88b3f3108d8c4ce44e3205d67437202 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Thu, 16 Apr 2015 14:47:33 +0200
Subject: kexec: allocate the kexec control page with KEXEC_CONTROL_MEMORY_GFP

Introduce KEXEC_CONTROL_MEMORY_GFP to allow the architecture code
to override the gfp flags of the allocation for the kexec control
page. The loop in kimage_alloc_normal_control_pages allocates pages
with GFP_KERNEL until a page is found that happens to have an
address smaller than the KEXEC_CONTROL_MEMORY_LIMIT. On systems
with a large memory size but a small KEXEC_CONTROL_MEMORY_LIMIT
the loop will keep allocating memory until the oom killer steps in.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 kernel/kexec.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index 38c25b1f2fd5..7a36fdcca5bf 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -707,7 +707,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
 	do {
 		unsigned long pfn, epfn, addr, eaddr;
 
-		pages = kimage_alloc_pages(GFP_KERNEL, order);
+		pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
 		if (!pages)
 			break;
 		pfn   = page_to_pfn(pages);
-- 
cgit v1.2.3


From 10a50f1ab5f06c9a3ee5ece3ec52e607ed53c79f Mon Sep 17 00:00:00 2001
From: Roger Quadros <rogerq@ti.com>
Date: Wed, 15 Apr 2015 11:14:11 +0300
Subject: genirq: Set IRQCHIP_SKIP_SET_WAKE flag for dummy_irq_chip

Without this system suspend is broken on systems that have
drivers calling enable/disable_irq_wake() for interrupts based off
the dummy irq hook. (e.g. drivers/gpio/gpio-pcf857x.c)

Signed-off-by: Roger Quadros <rogerq@ti.com>
Cc: <cw00.choi@samsung.com>
Cc: <balbi@ti.com>
Cc: <tony@atomide.com>
Cc: Gregory Clement <gregory.clement@free-electrons.com>
Link: http://lkml.kernel.org/r/552E1DD3.4040106@ti.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/dummychip.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c
index 988dc58e8847..2feb6feca0cc 100644
--- a/kernel/irq/dummychip.c
+++ b/kernel/irq/dummychip.c
@@ -57,5 +57,6 @@ struct irq_chip dummy_irq_chip = {
 	.irq_ack	= noop,
 	.irq_mask	= noop,
 	.irq_unmask	= noop,
+	.flags		= IRQCHIP_SKIP_SET_WAKE,
 };
 EXPORT_SYMBOL_GPL(dummy_irq_chip);
-- 
cgit v1.2.3


From 149aabcc44e3e2c1f8fe4f0832be53d2db55b598 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Fri, 10 Apr 2015 12:56:41 +0530
Subject: clockevents: Shutdown detached clockevent device

A clockevent device is marked DETACHED when it is replaced by another
clockevent device.

The device is shutdown properly for drivers that implement legacy
->set_mode() callback, as we call ->set_mode() for CLOCK_EVT_MODE_UNUSED
as well.

But for the new per-state callback interface, we skip shutting down the
device, as we thought its an internal state change. That wasn't correct.

The effect is that the device is left programmed in oneshot or periodic
mode.

Fall-back to 'case CLOCK_EVT_STATE_SHUTDOWN', to shutdown the device.

Fixes: bd624d75db21 "clockevents: Introduce mode specific callbacks"
Reported-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Cc: linaro-kernel@lists.linaro.org
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/eef0a91c51b74d4e52c8e5a95eca27b5a0563f07.1428650683.git.viresh.kumar@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/clockevents.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 25d942d1da27..629be4e21e96 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -117,11 +117,7 @@ static int __clockevents_set_state(struct clock_event_device *dev,
 	/* Transition with new state-specific callbacks */
 	switch (state) {
 	case CLOCK_EVT_STATE_DETACHED:
-		/*
-		 * This is an internal state, which is guaranteed to go from
-		 * SHUTDOWN to DETACHED. No driver interaction required.
-		 */
-		return 0;
+		/* The clockevent device is getting replaced. Shut it down. */
 
 	case CLOCK_EVT_STATE_SHUTDOWN:
 		return dev->set_state_shutdown(dev);
-- 
cgit v1.2.3


From 73459e2a1ada09a68c02cc5b73f3116fc8194b3d Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 23 Apr 2015 13:20:18 +0200
Subject: x86: pvclock: Really remove the sched notifier for cross-cpu
 migrations

This reverts commits 0a4e6be9ca17c54817cf814b4b5aa60478c6df27
and 80f7fdb1c7f0f9266421f823964fd1962681f6ce.

The task migration notifier was originally introduced in order to support
the pvclock vsyscall with non-synchronized TSC, but KVM only supports it
with synchronized TSC.  Hence, on KVM the race condition is only needed
due to a bad implementation on the host side, and even then it's so rare
that it's mostly theoretical.

As far as KVM is concerned it's possible to fix the host, avoiding the
additional complexity in the vDSO and the (re)introduction of the task
migration notifier.

Xen, on the other hand, hasn't yet implemented vsyscall support at
all, so we do not care about its plans for non-synchronized TSC.

Reported-by: Peter Zijlstra <peterz@infradead.org>
Suggested-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 kernel/sched/core.c | 15 ---------------
 1 file changed, 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f9123a82cbb6..fe22f7510bce 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1016,13 +1016,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 		rq_clock_skip_update(rq, true);
 }
 
-static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
-
-void register_task_migration_notifier(struct notifier_block *n)
-{
-	atomic_notifier_chain_register(&task_migration_notifier, n);
-}
-
 #ifdef CONFIG_SMP
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 {
@@ -1053,18 +1046,10 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 	trace_sched_migrate_task(p, new_cpu);
 
 	if (task_cpu(p) != new_cpu) {
-		struct task_migration_notifier tmn;
-
 		if (p->sched_class->migrate_task_rq)
 			p->sched_class->migrate_task_rq(p, new_cpu);
 		p->se.nr_migrations++;
 		perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
-
-		tmn.task = p;
-		tmn.from_cpu = task_cpu(p);
-		tmn.to_cpu = new_cpu;
-
-		atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
 	}
 
 	__set_task_cpu(p, new_cpu);
-- 
cgit v1.2.3


From 876a7ae65b86d8cec8efe7d15d050ac61116874e Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Mon, 27 Apr 2015 14:40:37 -0700
Subject: bpf: fix 64-bit divide

ALU64_DIV instruction should be dividing 64-bit by 64-bit,
whereas do_div() does 64-bit by 32-bit divide.
x64 and arm64 JITs correctly implement 64 by 64 unsigned divide.
llvm BPF backend emits code assuming that ALU64_DIV does 64 by 64.

Fixes: 89aa075832b0 ("net: sock: allow eBPF programs to be attached to sockets")
Reported-by: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/core.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 4139a0f8b558..54f0e7fcd0e2 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -357,8 +357,8 @@ select_insn:
 	ALU64_MOD_X:
 		if (unlikely(SRC == 0))
 			return 0;
-		tmp = DST;
-		DST = do_div(tmp, SRC);
+		div64_u64_rem(DST, SRC, &tmp);
+		DST = tmp;
 		CONT;
 	ALU_MOD_X:
 		if (unlikely(SRC == 0))
@@ -367,8 +367,8 @@ select_insn:
 		DST = do_div(tmp, (u32) SRC);
 		CONT;
 	ALU64_MOD_K:
-		tmp = DST;
-		DST = do_div(tmp, IMM);
+		div64_u64_rem(DST, IMM, &tmp);
+		DST = tmp;
 		CONT;
 	ALU_MOD_K:
 		tmp = (u32) DST;
@@ -377,7 +377,7 @@ select_insn:
 	ALU64_DIV_X:
 		if (unlikely(SRC == 0))
 			return 0;
-		do_div(DST, SRC);
+		DST = div64_u64(DST, SRC);
 		CONT;
 	ALU_DIV_X:
 		if (unlikely(SRC == 0))
@@ -387,7 +387,7 @@ select_insn:
 		DST = (u32) tmp;
 		CONT;
 	ALU64_DIV_K:
-		do_div(DST, IMM);
+		DST = div64_u64(DST, IMM);
 		CONT;
 	ALU_DIV_K:
 		tmp = (u32) DST;
-- 
cgit v1.2.3


From df8d9eeadd0f7a216f2476351d5aee43c6550bf0 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 29 Apr 2015 15:19:21 +0200
Subject: cpuidle: Run tick_broadcast_exit() with disabled interrupts

Commit 335f49196fd6 (sched/idle: Use explicit broadcast oneshot
control function) replaced clockevents_notify() invocations in
cpuidle_idle_call() with direct calls to tick_broadcast_enter()
and tick_broadcast_exit(), but it overlooked the fact that
interrupts were already enabled before calling the latter which
led to functional breakage on systems using idle states with the
CPUIDLE_FLAG_TIMER_STOP flag set.

Fix that by moving the invocations of tick_broadcast_enter()
and tick_broadcast_exit() down into cpuidle_enter_state() where
interrupts are still disabled when tick_broadcast_exit() is
called.  Also ensure that interrupts will be disabled before
running tick_broadcast_exit() even if they have been enabled by
the idle state's ->enter callback.  Trigger a WARN_ON_ONCE() in
that case, as we generally don't want that to happen for states
with CPUIDLE_FLAG_TIMER_STOP set.

Fixes: 335f49196fd6 (sched/idle: Use explicit broadcast oneshot control function)
Reported-and-tested-by: Linus Walleij <linus.walleij@linaro.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Reported-and-tested-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/sched/idle.c | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index deef1caa94c6..fefcb1fa5160 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -81,7 +81,6 @@ static void cpuidle_idle_call(void)
 	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
 	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
 	int next_state, entered_state;
-	unsigned int broadcast;
 	bool reflect;
 
 	/*
@@ -150,17 +149,6 @@ static void cpuidle_idle_call(void)
 		goto exit_idle;
 	}
 
-	broadcast = drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP;
-
-	/*
-	 * Tell the time framework to switch to a broadcast timer
-	 * because our local timer will be shutdown. If a local timer
-	 * is used from another cpu as a broadcast timer, this call may
-	 * fail if it is not available
-	 */
-	if (broadcast && tick_broadcast_enter())
-		goto use_default;
-
 	/* Take note of the planned idle state. */
 	idle_set_state(this_rq(), &drv->states[next_state]);
 
@@ -174,8 +162,8 @@ static void cpuidle_idle_call(void)
 	/* The cpu is no longer idle or about to enter idle. */
 	idle_set_state(this_rq(), NULL);
 
-	if (broadcast)
-		tick_broadcast_exit();
+	if (entered_state == -EBUSY)
+		goto use_default;
 
 	/*
 	 * Give the governor an opportunity to reflect on the outcome
-- 
cgit v1.2.3


From 9c4249c8e0221e5cfae758d35b768aee84abf6c0 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 30 Apr 2015 14:58:43 +0100
Subject: modsign: change default key details

Change default key details to be more obviously unspecified.

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <james.l.morris@oracle.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/Makefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 0f8f8b0bc1bf..60c302cfb4d3 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -197,9 +197,9 @@ x509.genkey:
 	@echo >>x509.genkey "x509_extensions = myexts"
 	@echo >>x509.genkey
 	@echo >>x509.genkey "[ req_distinguished_name ]"
-	@echo >>x509.genkey "O = Magrathea"
-	@echo >>x509.genkey "CN = Glacier signing key"
-	@echo >>x509.genkey "emailAddress = slartibartfast@magrathea.h2g2"
+	@echo >>x509.genkey "#O = Unspecified company"
+	@echo >>x509.genkey "CN = Build time autogenerated kernel key"
+	@echo >>x509.genkey "#emailAddress = unspecified.user@unspecified.company"
 	@echo >>x509.genkey
 	@echo >>x509.genkey "[ myexts ]"
 	@echo >>x509.genkey "basicConstraints=critical,CA:FALSE"
-- 
cgit v1.2.3


From ac01ce1410fc2c7b5f3af5e9c972e6a412eee54f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alex=20Benn=C3=A9e?= <alex.bennee@linaro.org>
Date: Wed, 29 Apr 2015 16:18:46 +0100
Subject: tracing: Make ftrace_print_array_seq compute buf_len
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The only caller to this function (__print_array) was getting it wrong by
passing the array length instead of buffer length. As the element size
was already being passed for other reasons it seems reasonable to push
the calculation of buffer length into the function.

Link: http://lkml.kernel.org/r/1430320727-14582-1-git-send-email-alex.bennee@linaro.org

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_output.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 692bf7184c8c..25a086bcb700 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -178,12 +178,13 @@ ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
 EXPORT_SYMBOL(ftrace_print_hex_seq);
 
 const char *
-ftrace_print_array_seq(struct trace_seq *p, const void *buf, int buf_len,
+ftrace_print_array_seq(struct trace_seq *p, const void *buf, int count,
 		       size_t el_size)
 {
 	const char *ret = trace_seq_buffer_ptr(p);
 	const char *prefix = "";
 	void *ptr = (void *)buf;
+	size_t buf_len = count * el_size;
 
 	trace_seq_putc(p, '{');
 
-- 
cgit v1.2.3


From 0782e63bc6fe7e2d3408d250df11d388b7799c6b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 5 May 2015 19:49:49 +0200
Subject: sched: Handle priority boosted tasks proper in setscheduler()

Ronny reported that the following scenario is not handled correctly:

	T1 (prio = 10)
	   lock(rtmutex);

	T2 (prio = 20)
	   lock(rtmutex)
	      boost T1

	T1 (prio = 20)
	   sys_set_scheduler(prio = 30)
	   T1 prio = 30
	   ....
	   sys_set_scheduler(prio = 10)
	   T1 prio = 30

The last step is wrong as T1 should now be back at prio 20.

Commit c365c292d059 ("sched: Consider pi boosting in setscheduler()")
only handles the case where a boosted tasks tries to lower its
priority.

Fix it by taking the new effective priority into account for the
decision whether a change of the priority is required.

Reported-by: Ronny Meeus <ronny.meeus@gmail.com>
Tested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Cc: <stable@vger.kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Mike Galbraith <umgwanakikbuti@gmail.com>
Fixes: c365c292d059 ("sched: Consider pi boosting in setscheduler()")
Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1505051806060.4225@nanos
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/rtmutex.c | 12 +++++++-----
 kernel/sched/core.c      | 26 ++++++++++++++------------
 2 files changed, 21 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index b73279367087..b025295f4966 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -265,15 +265,17 @@ struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
 }
 
 /*
- * Called by sched_setscheduler() to check whether the priority change
- * is overruled by a possible priority boosting.
+ * Called by sched_setscheduler() to get the priority which will be
+ * effective after the change.
  */
-int rt_mutex_check_prio(struct task_struct *task, int newprio)
+int rt_mutex_get_effective_prio(struct task_struct *task, int newprio)
 {
 	if (!task_has_pi_waiters(task))
-		return 0;
+		return newprio;
 
-	return task_top_pi_waiter(task)->task->prio <= newprio;
+	if (task_top_pi_waiter(task)->task->prio <= newprio)
+		return task_top_pi_waiter(task)->task->prio;
+	return newprio;
 }
 
 /*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fe22f7510bce..34db9bf892a3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3300,15 +3300,18 @@ static void __setscheduler_params(struct task_struct *p,
 
 /* Actually do priority change: must hold pi & rq lock. */
 static void __setscheduler(struct rq *rq, struct task_struct *p,
-			   const struct sched_attr *attr)
+			   const struct sched_attr *attr, bool keep_boost)
 {
 	__setscheduler_params(p, attr);
 
 	/*
-	 * If we get here, there was no pi waiters boosting the
-	 * task. It is safe to use the normal prio.
+	 * Keep a potential priority boosting if called from
+	 * sched_setscheduler().
 	 */
-	p->prio = normal_prio(p);
+	if (keep_boost)
+		p->prio = rt_mutex_get_effective_prio(p, normal_prio(p));
+	else
+		p->prio = normal_prio(p);
 
 	if (dl_prio(p->prio))
 		p->sched_class = &dl_sched_class;
@@ -3408,7 +3411,7 @@ static int __sched_setscheduler(struct task_struct *p,
 	int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
 		      MAX_RT_PRIO - 1 - attr->sched_priority;
 	int retval, oldprio, oldpolicy = -1, queued, running;
-	int policy = attr->sched_policy;
+	int new_effective_prio, policy = attr->sched_policy;
 	unsigned long flags;
 	const struct sched_class *prev_class;
 	struct rq *rq;
@@ -3590,15 +3593,14 @@ change:
 	oldprio = p->prio;
 
 	/*
-	 * Special case for priority boosted tasks.
-	 *
-	 * If the new priority is lower or equal (user space view)
-	 * than the current (boosted) priority, we just store the new
+	 * Take priority boosted tasks into account. If the new
+	 * effective priority is unchanged, we just store the new
 	 * normal parameters and do not touch the scheduler class and
 	 * the runqueue. This will be done when the task deboost
 	 * itself.
 	 */
-	if (rt_mutex_check_prio(p, newprio)) {
+	new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
+	if (new_effective_prio == oldprio) {
 		__setscheduler_params(p, attr);
 		task_rq_unlock(rq, p, &flags);
 		return 0;
@@ -3612,7 +3614,7 @@ change:
 		put_prev_task(rq, p);
 
 	prev_class = p->sched_class;
-	__setscheduler(rq, p, attr);
+	__setscheduler(rq, p, attr, true);
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
@@ -7346,7 +7348,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
 	queued = task_on_rq_queued(p);
 	if (queued)
 		dequeue_task(rq, p, 0);
-	__setscheduler(rq, p, &attr);
+	__setscheduler(rq, p, &attr, false);
 	if (queued) {
 		enqueue_task(rq, p, 0);
 		resched_curr(rq);
-- 
cgit v1.2.3


From 533445c6e53368569e50ab3fb712230c03d523f3 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@osandov.com>
Date: Mon, 4 May 2015 03:09:36 -0700
Subject: sched/core: Fix regression in cpuset_cpu_inactive() for suspend

Commit 3c18d447b3b3 ("sched/core: Check for available DL bandwidth in
cpuset_cpu_inactive()"), a SCHED_DEADLINE bugfix, had a logic error that
caused a regression in setting a CPU inactive during suspend. I ran into
this when a program was failing pthread_setaffinity_np() with EINVAL after
a suspend+wake up.

A simple reproducer:

	$ ./a.out
	sched_setaffinity: Success
	$ systemctl suspend
	$ ./a.out
	sched_setaffinity: Invalid argument

... where ./a.out is:

	#define _GNU_SOURCE
	#include <errno.h>
	#include <sched.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		long num_cores;
		cpu_set_t cpu_set;
		int ret;

		num_cores = sysconf(_SC_NPROCESSORS_ONLN);
		CPU_ZERO(&cpu_set);
		CPU_SET(num_cores - 1, &cpu_set);
		errno = 0;
		ret = sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set);
		perror("sched_setaffinity");
		return ret ? EXIT_FAILURE : EXIT_SUCCESS;
	}

The mistake is that suspend is handled in the action ==
CPU_DOWN_PREPARE_FROZEN case of the switch statement in
cpuset_cpu_inactive().

However, the commit in question masked out CPU_TASKS_FROZEN
from the action, making this case dead.

The fix is straightforward.

Signed-off-by: Omar Sandoval <osandov@osandov.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 3c18d447b3b3 ("sched/core: Check for available DL bandwidth in cpuset_cpu_inactive()")
Link: http://lkml.kernel.org/r/1cb5ecb3d6543c38cce5790387f336f54ec8e2bc.1430733960.git.osandov@osandov.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 34db9bf892a3..57bd333bc4ab 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6999,27 +6999,23 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
 	unsigned long flags;
 	long cpu = (long)hcpu;
 	struct dl_bw *dl_b;
+	bool overflow;
+	int cpus;
 
-	switch (action & ~CPU_TASKS_FROZEN) {
+	switch (action) {
 	case CPU_DOWN_PREPARE:
-		/* explicitly allow suspend */
-		if (!(action & CPU_TASKS_FROZEN)) {
-			bool overflow;
-			int cpus;
-
-			rcu_read_lock_sched();
-			dl_b = dl_bw_of(cpu);
+		rcu_read_lock_sched();
+		dl_b = dl_bw_of(cpu);
 
-			raw_spin_lock_irqsave(&dl_b->lock, flags);
-			cpus = dl_bw_cpus(cpu);
-			overflow = __dl_overflow(dl_b, cpus, 0, 0);
-			raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+		raw_spin_lock_irqsave(&dl_b->lock, flags);
+		cpus = dl_bw_cpus(cpu);
+		overflow = __dl_overflow(dl_b, cpus, 0, 0);
+		raw_spin_unlock_irqrestore(&dl_b->lock, flags);
 
-			rcu_read_unlock_sched();
+		rcu_read_unlock_sched();
 
-			if (overflow)
-				return notifier_from_errno(-EBUSY);
-		}
+		if (overflow)
+			return notifier_from_errno(-EBUSY);
 		cpuset_update_active_cpus(false);
 		break;
 	case CPU_DOWN_PREPARE_FROZEN:
-- 
cgit v1.2.3


From 8b10c5e2b59ef2a80a07ab594a3b4987a4676211 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 1 May 2015 16:08:46 +0200
Subject: perf: Annotate inherited event ctx->mutex recursion

While fuzzing Sasha tripped over another ctx->mutex recursion lockdep
splat. Annotate this.

Reported-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 41 +++++++++++++++++++++++++++++++++--------
 1 file changed, 33 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 81aa3a4ece9f..1a3bf48743ce 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -913,10 +913,30 @@ static void put_ctx(struct perf_event_context *ctx)
  * Those places that change perf_event::ctx will hold both
  * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
  *
- * Lock ordering is by mutex address. There is one other site where
- * perf_event_context::mutex nests and that is put_event(). But remember that
- * that is a parent<->child context relation, and migration does not affect
- * children, therefore these two orderings should not interact.
+ * Lock ordering is by mutex address. There are two other sites where
+ * perf_event_context::mutex nests and those are:
+ *
+ *  - perf_event_exit_task_context()	[ child , 0 ]
+ *      __perf_event_exit_task()
+ *        sync_child_event()
+ *          put_event()			[ parent, 1 ]
+ *
+ *  - perf_event_init_context()		[ parent, 0 ]
+ *      inherit_task_group()
+ *        inherit_group()
+ *          inherit_event()
+ *            perf_event_alloc()
+ *              perf_init_event()
+ *                perf_try_init_event()	[ child , 1 ]
+ *
+ * While it appears there is an obvious deadlock here -- the parent and child
+ * nesting levels are inverted between the two. This is in fact safe because
+ * life-time rules separate them. That is an exiting task cannot fork, and a
+ * spawning task cannot (yet) exit.
+ *
+ * But remember that that these are parent<->child context relations, and
+ * migration does not affect children, therefore these two orderings should not
+ * interact.
  *
  * The change in perf_event::ctx does not affect children (as claimed above)
  * because the sys_perf_event_open() case will install a new event and break
@@ -3657,9 +3677,6 @@ static void perf_remove_from_owner(struct perf_event *event)
 	}
 }
 
-/*
- * Called when the last reference to the file is gone.
- */
 static void put_event(struct perf_event *event)
 {
 	struct perf_event_context *ctx;
@@ -3697,6 +3714,9 @@ int perf_event_release_kernel(struct perf_event *event)
 }
 EXPORT_SYMBOL_GPL(perf_event_release_kernel);
 
+/*
+ * Called when the last reference to the file is gone.
+ */
 static int perf_release(struct inode *inode, struct file *file)
 {
 	put_event(file->private_data);
@@ -7364,7 +7384,12 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
 		return -ENODEV;
 
 	if (event->group_leader != event) {
-		ctx = perf_event_ctx_lock(event->group_leader);
+		/*
+		 * This ctx->mutex can nest when we're called through
+		 * inheritance. See the perf_event_ctx_lock_nested() comment.
+		 */
+		ctx = perf_event_ctx_lock_nested(event->group_leader,
+						 SINGLE_DEPTH_NESTING);
 		BUG_ON(!ctx);
 	}
 
-- 
cgit v1.2.3


From 37815bf866ab6722a47550f8d25ad3f1a16a680c Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Sat, 9 May 2015 03:06:23 +0930
Subject: module: Call module notifier on failure after complete_formation()

The module notifier call chain for MODULE_STATE_COMING was moved up before
the parsing of args, into the complete_formation() call. But if the module failed
to load after that, the notifier call chain for MODULE_STATE_GOING was
never called and that prevented the users of those call chains from
cleaning up anything that was allocated.

Link: http://lkml.kernel.org/r/554C52B9.9060700@gmail.com

Reported-by: Pontus Fuchs <pontus.fuchs@gmail.com>
Fixes: 4982223e51e8 "module: set nx before marking module MODULE_STATE_COMING"
Cc: stable@vger.kernel.org # 3.16+
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 42a1d2afb217..cfc9e843a924 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3370,6 +3370,9 @@ static int load_module(struct load_info *info, const char __user *uargs,
 	module_bug_cleanup(mod);
 	mutex_unlock(&module_mutex);
 
+	blocking_notifier_call_chain(&module_notify_list,
+				     MODULE_STATE_GOING, mod);
+
 	/* we can't deallocate the module until we clear memory protection */
 	unset_module_init_ro_nx(mod);
 	unset_module_core_ro_nx(mod);
-- 
cgit v1.2.3


From f7bcb70ebae0dcdb5a2d859b09e4465784d99029 Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Fri, 8 May 2015 13:47:23 -0700
Subject: ktime: Fix ktime_divns to do signed division

It was noted that the 32bit implementation of ktime_divns()
was doing unsigned division and didn't properly handle
negative values.

And when a ktime helper was changed to utilize
ktime_divns, it caused a regression on some IR blasters.
See the following bugzilla for details:
  https://bugzilla.redhat.com/show_bug.cgi?id=1200353

This patch fixes the problem in ktime_divns by checking
and preserving the sign bit, and then reapplying it if
appropriate after the division, it also changes the return
type to a s64 to make it more obvious this is expected.

Nicolas also pointed out that negative dividers would
cause infinite loops on 32bit systems, negative dividers
is unlikely for users of this function, but out of caution
this patch adds checks for negative dividers for both
32-bit (BUG_ON) and 64-bit(WARN_ON) versions to make sure
no such use cases creep in.

[ tglx: Hand an u64 to do_div() to avoid the compiler warning ]

Fixes: 166afb64511e 'ktime: Sanitize ktime_to_us/ms conversion'
Reported-and-tested-by: Trevor Cordes <trevor@tecnopolis.ca>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Acked-by: Nicolas Pitre <nicolas.pitre@linaro.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Josh Boyer <jwboyer@redhat.com>
Cc: One Thousand Gnomes <gnomes@lxorguk.ukuu.org.uk>
Cc: <stable@vger.kernel.org>
Link: http://lkml.kernel.org/r/1431118043-23452-1-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/hrtimer.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 76d4bd962b19..93ef7190bdea 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -266,21 +266,23 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
 /*
  * Divide a ktime value by a nanosecond value
  */
-u64 __ktime_divns(const ktime_t kt, s64 div)
+s64 __ktime_divns(const ktime_t kt, s64 div)
 {
-	u64 dclc;
 	int sft = 0;
+	s64 dclc;
+	u64 tmp;
 
 	dclc = ktime_to_ns(kt);
+	tmp = dclc < 0 ? -dclc : dclc;
+
 	/* Make sure the divisor is less than 2^32: */
 	while (div >> 32) {
 		sft++;
 		div >>= 1;
 	}
-	dclc >>= sft;
-	do_div(dclc, (unsigned long) div);
-
-	return dclc;
+	tmp >>= sft;
+	do_div(tmp, (unsigned long) div);
+	return dclc < 0 ? -tmp : tmp;
 }
 EXPORT_SYMBOL_GPL(__ktime_divns);
 #endif /* BITS_PER_LONG >= 64 */
-- 
cgit v1.2.3


From ab992dc38f9ae40b3ab996d68449692d464c98cf Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 18 May 2015 11:31:50 +0200
Subject: watchdog: Fix merge 'conflict'

Two watchdog changes that came through different trees had a non
conflicting conflict, that is, one changed the semantics of a variable
but no actual code conflict happened. So the merge appeared fine, but
the resulting code did not behave as expected.

Commit 195daf665a62 ("watchdog: enable the new user interface of the
watchdog mechanism") changes the semantics of watchdog_user_enabled,
which thereafter is only used by the functions introduced by
b3738d293233 ("watchdog: Add watchdog enable/disable all functions").

There further appears to be a distinct lack of serialization between
setting and using watchdog_enabled, so perhaps we should wrap the
{en,dis}able_all() things in watchdog_proc_mutex.

This patch fixes a s2r failure reported by Michal; which I cannot
readily explain. But this does make the code internally consistent
again.

Reported-and-tested-by: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/watchdog.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 2316f50b07a4..506edcc500c4 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -41,6 +41,8 @@
 #define NMI_WATCHDOG_ENABLED      (1 << NMI_WATCHDOG_ENABLED_BIT)
 #define SOFT_WATCHDOG_ENABLED     (1 << SOFT_WATCHDOG_ENABLED_BIT)
 
+static DEFINE_MUTEX(watchdog_proc_mutex);
+
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
 static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED;
 #else
@@ -608,26 +610,36 @@ void watchdog_nmi_enable_all(void)
 {
 	int cpu;
 
-	if (!watchdog_user_enabled)
-		return;
+	mutex_lock(&watchdog_proc_mutex);
+
+	if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
+		goto unlock;
 
 	get_online_cpus();
 	for_each_online_cpu(cpu)
 		watchdog_nmi_enable(cpu);
 	put_online_cpus();
+
+unlock:
+	mutex_lock(&watchdog_proc_mutex);
 }
 
 void watchdog_nmi_disable_all(void)
 {
 	int cpu;
 
+	mutex_lock(&watchdog_proc_mutex);
+
 	if (!watchdog_running)
-		return;
+		goto unlock;
 
 	get_online_cpus();
 	for_each_online_cpu(cpu)
 		watchdog_nmi_disable(cpu);
 	put_online_cpus();
+
+unlock:
+	mutex_unlock(&watchdog_proc_mutex);
 }
 #else
 static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
@@ -744,8 +756,6 @@ static int proc_watchdog_update(void)
 
 }
 
-static DEFINE_MUTEX(watchdog_proc_mutex);
-
 /*
  * common function for watchdog, nmi_watchdog and soft_watchdog parameter
  *
-- 
cgit v1.2.3


From 10d784eae2b41e25d8fc6a88096cd27286093c84 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Fri, 8 May 2015 10:51:29 -0700
Subject: sched: always use blk_schedule_flush_plug in io_schedule_out

block plug callback could sleep, so we introduce a parameter
'from_schedule' and corresponding drivers can use it to destinguish a
schedule plug flush or a plug finish. Unfortunately io_schedule_out
still uses blk_flush_plug(). This causes below output (Note, I added a
might_sleep() in raid1_unplug to make it trigger faster, but the whole
thing doesn't matter if I add might_sleep). In raid1/10, this can cause
deadlock.

This patch makes io_schedule_out always uses blk_schedule_flush_plug.
This should only impact drivers (as far as I know, raid 1/10) which are
sensitive to the 'from_schedule' parameter.

[  370.817949] ------------[ cut here ]------------
[  370.817960] WARNING: CPU: 7 PID: 145 at ../kernel/sched/core.c:7306 __might_sleep+0x7f/0x90()
[  370.817969] do not call blocking ops when !TASK_RUNNING; state=2 set at [<ffffffff81092fcf>] prepare_to_wait+0x2f/0x90
[  370.817971] Modules linked in: raid1
[  370.817976] CPU: 7 PID: 145 Comm: kworker/u16:9 Tainted: G        W       4.0.0+ #361
[  370.817977] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140709_153802- 04/01/2014
[  370.817983] Workqueue: writeback bdi_writeback_workfn (flush-9:1)
[  370.817985]  ffffffff81cd83be ffff8800ba8cb298 ffffffff819dd7af 0000000000000001
[  370.817988]  ffff8800ba8cb2e8 ffff8800ba8cb2d8 ffffffff81051afc ffff8800ba8cb2c8
[  370.817990]  ffffffffa00061a8 000000000000041e 0000000000000000 ffff8800ba8cba28
[  370.817993] Call Trace:
[  370.817999]  [<ffffffff819dd7af>] dump_stack+0x4f/0x7b
[  370.818002]  [<ffffffff81051afc>] warn_slowpath_common+0x8c/0xd0
[  370.818004]  [<ffffffff81051b86>] warn_slowpath_fmt+0x46/0x50
[  370.818006]  [<ffffffff81092fcf>] ? prepare_to_wait+0x2f/0x90
[  370.818008]  [<ffffffff81092fcf>] ? prepare_to_wait+0x2f/0x90
[  370.818010]  [<ffffffff810776ef>] __might_sleep+0x7f/0x90
[  370.818014]  [<ffffffffa0000c03>] raid1_unplug+0xd3/0x170 [raid1]
[  370.818024]  [<ffffffff81421d9a>] blk_flush_plug_list+0x8a/0x1e0
[  370.818028]  [<ffffffff819e3550>] ? bit_wait+0x50/0x50
[  370.818031]  [<ffffffff819e21b0>] io_schedule_timeout+0x130/0x140
[  370.818033]  [<ffffffff819e3586>] bit_wait_io+0x36/0x50
[  370.818034]  [<ffffffff819e31b5>] __wait_on_bit+0x65/0x90
[  370.818041]  [<ffffffff8125b67c>] ? ext4_read_block_bitmap_nowait+0xbc/0x630
[  370.818043]  [<ffffffff819e3550>] ? bit_wait+0x50/0x50
[  370.818045]  [<ffffffff819e3302>] out_of_line_wait_on_bit+0x72/0x80
[  370.818047]  [<ffffffff810935e0>] ? autoremove_wake_function+0x40/0x40
[  370.818050]  [<ffffffff811de744>] __wait_on_buffer+0x44/0x50
[  370.818053]  [<ffffffff8125ae80>] ext4_wait_block_bitmap+0xe0/0xf0
[  370.818058]  [<ffffffff812975d6>] ext4_mb_init_cache+0x206/0x790
[  370.818062]  [<ffffffff8114bc6c>] ? lru_cache_add+0x1c/0x50
[  370.818064]  [<ffffffff81297c7e>] ext4_mb_init_group+0x11e/0x200
[  370.818066]  [<ffffffff81298231>] ext4_mb_load_buddy+0x341/0x360
[  370.818068]  [<ffffffff8129a1a3>] ext4_mb_find_by_goal+0x93/0x2f0
[  370.818070]  [<ffffffff81295b54>] ? ext4_mb_normalize_request+0x1e4/0x5b0
[  370.818072]  [<ffffffff8129ab67>] ext4_mb_regular_allocator+0x67/0x460
[  370.818074]  [<ffffffff81295b54>] ? ext4_mb_normalize_request+0x1e4/0x5b0
[  370.818076]  [<ffffffff8129ca4b>] ext4_mb_new_blocks+0x4cb/0x620
[  370.818079]  [<ffffffff81290956>] ext4_ext_map_blocks+0x4c6/0x14d0
[  370.818081]  [<ffffffff812a4d4e>] ? ext4_es_lookup_extent+0x4e/0x290
[  370.818085]  [<ffffffff8126399d>] ext4_map_blocks+0x14d/0x4f0
[  370.818088]  [<ffffffff81266fbd>] ext4_writepages+0x76d/0xe50
[  370.818094]  [<ffffffff81149691>] do_writepages+0x21/0x50
[  370.818097]  [<ffffffff811d5c00>] __writeback_single_inode+0x60/0x490
[  370.818099]  [<ffffffff811d630a>] writeback_sb_inodes+0x2da/0x590
[  370.818103]  [<ffffffff811abf4b>] ? trylock_super+0x1b/0x50
[  370.818105]  [<ffffffff811abf4b>] ? trylock_super+0x1b/0x50
[  370.818107]  [<ffffffff811d665f>] __writeback_inodes_wb+0x9f/0xd0
[  370.818109]  [<ffffffff811d69db>] wb_writeback+0x34b/0x3c0
[  370.818111]  [<ffffffff811d70df>] bdi_writeback_workfn+0x23f/0x550
[  370.818116]  [<ffffffff8106bbd8>] process_one_work+0x1c8/0x570
[  370.818117]  [<ffffffff8106bb5b>] ? process_one_work+0x14b/0x570
[  370.818119]  [<ffffffff8106c09b>] worker_thread+0x11b/0x470
[  370.818121]  [<ffffffff8106bf80>] ? process_one_work+0x570/0x570
[  370.818124]  [<ffffffff81071868>] kthread+0xf8/0x110
[  370.818126]  [<ffffffff81071770>] ? kthread_create_on_node+0x210/0x210
[  370.818129]  [<ffffffff819e9322>] ret_from_fork+0x42/0x70
[  370.818131]  [<ffffffff81071770>] ? kthread_create_on_node+0x210/0x210
[  370.818132] ---[ end trace 7b4deb71e68b6605 ]---

V2: don't change ->in_iowait

Cc: NeilBrown <neilb@suse.de>
Signed-off-by: Shaohua Li <shli@fb.com>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 kernel/sched/core.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fe22f7510bce..cfeebb499e79 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4387,10 +4387,7 @@ long __sched io_schedule_timeout(long timeout)
 	long ret;
 
 	current->in_iowait = 1;
-	if (old_iowait)
-		blk_schedule_flush_plug(current);
-	else
-		blk_flush_plug(current);
+	blk_schedule_flush_plug(current);
 
 	delayacct_blkio_start();
 	rq = raw_rq();
-- 
cgit v1.2.3


From 1173ff09b9c57be8248427b7be161f7599dccd6b Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.cz>
Date: Tue, 19 May 2015 09:07:27 +0200
Subject: watchdog: fix double lock in watchdog_nmi_enable_all

Commit ab992dc38f9a ("watchdog: Fix merge 'conflict'") has introduced an
obvious deadlock because of a typo.  watchdog_proc_mutex should be
unlocked on exit.

Thanks to Miroslav Benes who was staring at the code with me and noticed
this.

Signed-off-by: Michal Hocko <mhocko@suse.cz>
Duh-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/watchdog.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 506edcc500c4..581a68a04c64 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -621,7 +621,7 @@ void watchdog_nmi_enable_all(void)
 	put_online_cpus();
 
 unlock:
-	mutex_lock(&watchdog_proc_mutex);
+	mutex_unlock(&watchdog_proc_mutex);
 }
 
 void watchdog_nmi_disable_all(void)
-- 
cgit v1.2.3