From 9642d18eee2cd169b60c6ac0f20bda745b5a3d1e Mon Sep 17 00:00:00 2001
From: Vatika Harlalka <vatikaharlalka@gmail.com>
Date: Tue, 1 Sep 2015 16:50:59 +0200
Subject: nohz: Affine unpinned timers to housekeepers

The problem addressed in this patch is about affining unpinned
timers. Adaptive or Full Dynticks CPUs are currently disturbed
by unnecessary jitter due to firing of such timers on them.

This patch will affine timers to online CPUs which are not full
dynticks in NOHZ_FULL configured systems. It should not
introduce overhead in nohz full off case due to static keys.

Signed-off-by: Vatika Harlalka <vatikaharlalka@gmail.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Reviewed-by: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Chris Metcalf <cmetcalf@ezchip.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1441119060-2230-2-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8b864ecee0e1..0902e4d72671 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -623,18 +623,21 @@ int get_nohz_timer_target(void)
 	int i, cpu = smp_processor_id();
 	struct sched_domain *sd;
 
-	if (!idle_cpu(cpu))
+	if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
 		return cpu;
 
 	rcu_read_lock();
 	for_each_domain(cpu, sd) {
 		for_each_cpu(i, sched_domain_span(sd)) {
-			if (!idle_cpu(i)) {
+			if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) {
 				cpu = i;
 				goto unlock;
 			}
 		}
 	}
+
+	if (!is_housekeeping_cpu(cpu))
+		cpu = housekeeping_any_cpu();
 unlock:
 	rcu_read_unlock();
 	return cpu;
-- 
cgit v1.2.3


From 7c8bb6cb95061b3143759459ed6c6b0c73bcfecb Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 1 Sep 2015 16:51:00 +0200
Subject: nohz: Assert existing housekeepers when nohz full enabled

The code ensures that when nohz full is running, at least the
boot CPU serves as a housekeeper and it can't be later offlined.

Let's assert this assumption to make sure that we have CPUs to
handle unbound jobs like workqueues and timers while nohz full
CPUs run undisturbed.

Also improve the comments on housekeeper offlining prevention.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Chris Metcalf <cmetcalf@ezchip.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Vatika Harlalka <vatikaharlalka@gmail.com>
Link: http://lkml.kernel.org/r/1441119060-2230-3-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/time/tick-sched.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 3319e16f31e5..7c7ec4515983 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -290,16 +290,17 @@ static int __init tick_nohz_full_setup(char *str)
 __setup("nohz_full=", tick_nohz_full_setup);
 
 static int tick_nohz_cpu_down_callback(struct notifier_block *nfb,
-						 unsigned long action,
-						 void *hcpu)
+				       unsigned long action,
+				       void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
 
 	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_DOWN_PREPARE:
 		/*
-		 * If we handle the timekeeping duty for full dynticks CPUs,
-		 * we can't safely shutdown that CPU.
+		 * The boot CPU handles housekeeping duty (unbound timers,
+		 * workqueues, timekeeping, ...) on behalf of full dynticks
+		 * CPUs. It must remain online when nohz full is enabled.
 		 */
 		if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
 			return NOTIFY_BAD;
@@ -370,6 +371,12 @@ void __init tick_nohz_init(void)
 	cpu_notifier(tick_nohz_cpu_down_callback, 0);
 	pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
 		cpumask_pr_args(tick_nohz_full_mask));
+
+	/*
+	 * We need at least one CPU to handle housekeeping work such
+	 * as timekeeping, unbound timers, workqueues, ...
+	 */
+	WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
 }
 #endif
 
-- 
cgit v1.2.3


From 21dd33b09c61597df603c654589adffd7955491a Mon Sep 17 00:00:00 2001
From: Lina Iyer <lina.iyer@linaro.org>
Date: Wed, 2 Sep 2015 16:18:57 -0600
Subject: kernel/cpu_pm: fix cpu_cluster_pm_exit comment

cpu_cluster_pm_exit() must be sent after cpu_cluster_pm_enter() has been
sent for the cluster and before any cpu_pm_exit() notifications are sent
for any CPU.

Cc: Nicolas Pitre <nicolas.pitre@linaro.org>
Acked-by: Kevin Hilman <khilman@linaro.org>
Signed-off-by: Lina Iyer <lina.iyer@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/cpu_pm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index 9656a3c36503..009cc9a17d95 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -180,7 +180,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
  * low power state that may have caused some blocks in the same power domain
  * to reset.
  *
- * Must be called after cpu_pm_exit has been called on all cpus in the power
+ * Must be called after cpu_cluster_pm_enter has been called for the power
  * domain, and before cpu_pm_exit has been called on any cpu in the power
  * domain. Notified drivers can include VFP co-processor, interrupt controller
  * and its PM extensions, local CPU timers context save/restore which
-- 
cgit v1.2.3


From 43b3f02899f74ae9914a39547cc5492156f0027a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 4 Sep 2015 17:25:23 +0200
Subject: locking/qspinlock/x86: Fix performance regression under unaccelerated
 VMs

Dave ran into horrible performance on a VM without PARAVIRT_SPINLOCKS
set and Linus noted that the test-and-set implementation was retarded.

One should spin on the variable with a load, not a RMW.

While there, remove 'queued' from the name, as the lock isn't queued
at all, but a simple test-and-set.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Reported-by: Dave Chinner <david@fromorbit.com>
Tested-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Waiman Long <Waiman.Long@hp.com>
Cc: stable@vger.kernel.org # v4.2+
Link: http://lkml.kernel.org/r/20150904152523.GR18673@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/qspinlock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 337c8818541d..87e9ce6a63c5 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -289,7 +289,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
 	if (pv_enabled())
 		goto queue;
 
-	if (virt_queued_spin_lock(lock))
+	if (virt_spin_lock(lock))
 		return;
 
 	/*
-- 
cgit v1.2.3


From 5473e0cc37c03c576adbda7591a6cc8e37c1bb7f Mon Sep 17 00:00:00 2001
From: Wanpeng Li <wanpeng.li@hotmail.com>
Date: Fri, 28 Aug 2015 14:55:56 +0800
Subject: sched: 'Annotate' migrate_tasks()

Kernel testing triggered this warning:

| WARNING: CPU: 0 PID: 13 at kernel/sched/core.c:1156 do_set_cpus_allowed+0x7e/0x80()
| Modules linked in:
| CPU: 0 PID: 13 Comm: migration/0 Not tainted 4.2.0-rc1-00049-g25834c7 #2
| Call Trace:
|   dump_stack+0x4b/0x75
|   warn_slowpath_common+0x8b/0xc0
|   warn_slowpath_null+0x22/0x30
|   do_set_cpus_allowed+0x7e/0x80
|   cpuset_cpus_allowed_fallback+0x7c/0x170
|   select_fallback_rq+0x221/0x280
|   migration_call+0xe3/0x250
|   notifier_call_chain+0x53/0x70
|   __raw_notifier_call_chain+0x1e/0x30
|   cpu_notify+0x28/0x50
|   take_cpu_down+0x22/0x40
|   multi_cpu_stop+0xd5/0x140
|   cpu_stopper_thread+0xbc/0x170
|   smpboot_thread_fn+0x174/0x2f0
|   kthread+0xc4/0xe0
|   ret_from_kernel_thread+0x21/0x30

As Peterz pointed out:

| So the normal rules for changing task_struct::cpus_allowed are holding
| both pi_lock and rq->lock, such that holding either stabilizes the mask.
|
| This is so that wakeup can happen without rq->lock and load-balance
| without pi_lock.
|
| From this we already get the relaxation that we can omit acquiring
| rq->lock if the task is not on the rq, because in that case
| load-balancing will not apply to it.
|
| ** these are the rules currently tested in do_set_cpus_allowed() **
|
| Now, since __set_cpus_allowed_ptr() uses task_rq_lock() which
| unconditionally acquires both locks, we could get away with holding just
| rq->lock when on_rq for modification because that'd still exclude
| __set_cpus_allowed_ptr(), it would also work against
| __kthread_bind_mask() because that assumes !on_rq.
|
| That said, this is all somewhat fragile.
|
| Now, I don't think dropping rq->lock is quite as disastrous as it
| usually is because !cpu_active at this point, which means load-balance
| will not interfere, but that too is somewhat fragile.
|
| So we end up with a choice of two fragile..

This patch fixes it by following the rules for changing
task_struct::cpus_allowed with both pi_lock and rq->lock held.

Reported-by: kernel test robot <ying.huang@intel.com>
Reported-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
[ Modified changelog and patch. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/BLU436-SMTP1660820490DE202E3934ED3806E0@phx.gbl
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0902e4d72671..9b786704d34b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5183,24 +5183,47 @@ static void migrate_tasks(struct rq *dead_rq)
 			break;
 
 		/*
-		 * Ensure rq->lock covers the entire task selection
-		 * until the migration.
+		 * pick_next_task assumes pinned rq->lock.
 		 */
 		lockdep_pin_lock(&rq->lock);
 		next = pick_next_task(rq, &fake_task);
 		BUG_ON(!next);
 		next->sched_class->put_prev_task(rq, next);
 
+		/*
+		 * Rules for changing task_struct::cpus_allowed are holding
+		 * both pi_lock and rq->lock, such that holding either
+		 * stabilizes the mask.
+		 *
+		 * Drop rq->lock is not quite as disastrous as it usually is
+		 * because !cpu_active at this point, which means load-balance
+		 * will not interfere. Also, stop-machine.
+		 */
+		lockdep_unpin_lock(&rq->lock);
+		raw_spin_unlock(&rq->lock);
+		raw_spin_lock(&next->pi_lock);
+		raw_spin_lock(&rq->lock);
+
+		/*
+		 * Since we're inside stop-machine, _nothing_ should have
+		 * changed the task, WARN if weird stuff happened, because in
+		 * that case the above rq->lock drop is a fail too.
+		 */
+		if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
+			raw_spin_unlock(&next->pi_lock);
+			continue;
+		}
+
 		/* Find suitable destination for @next, with force if needed. */
 		dest_cpu = select_fallback_rq(dead_rq->cpu, next);
 
-		lockdep_unpin_lock(&rq->lock);
 		rq = __migrate_task(rq, next, dest_cpu);
 		if (rq != dead_rq) {
 			raw_spin_unlock(&rq->lock);
 			rq = dead_rq;
 			raw_spin_lock(&rq->lock);
 		}
+		raw_spin_unlock(&next->pi_lock);
 	}
 
 	rq->stop = stop;
-- 
cgit v1.2.3


From 5b25b13ab08f616efd566347d809b4ece54570d1 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Fri, 11 Sep 2015 13:07:39 -0700
Subject: sys_membarrier(): system-wide memory barrier (generic, x86)

Here is an implementation of a new system call, sys_membarrier(), which
executes a memory barrier on all threads running on the system.  It is
implemented by calling synchronize_sched().  It can be used to
distribute the cost of user-space memory barriers asymmetrically by
transforming pairs of memory barriers into pairs consisting of
sys_membarrier() and a compiler barrier.  For synchronization primitives
that distinguish between read-side and write-side (e.g.  userspace RCU
[1], rwlocks), the read-side can be accelerated significantly by moving
the bulk of the memory barrier overhead to the write-side.

The existing applications of which I am aware that would be improved by
this system call are as follows:

* Through Userspace RCU library (http://urcu.so)
  - DNS server (Knot DNS) https://www.knot-dns.cz/
  - Network sniffer (http://netsniff-ng.org/)
  - Distributed object storage (https://sheepdog.github.io/sheepdog/)
  - User-space tracing (http://lttng.org)
  - Network storage system (https://www.gluster.org/)
  - Virtual routers (https://events.linuxfoundation.org/sites/events/files/slides/DPDK_RCU_0MQ.pdf)
  - Financial software (https://lkml.org/lkml/2015/3/23/189)

Those projects use RCU in userspace to increase read-side speed and
scalability compared to locking.  Especially in the case of RCU used by
libraries, sys_membarrier can speed up the read-side by moving the bulk of
the memory barrier cost to synchronize_rcu().

* Direct users of sys_membarrier
  - core dotnet garbage collector (https://github.com/dotnet/coreclr/issues/198)

Microsoft core dotnet GC developers are planning to use the mprotect()
side-effect of issuing memory barriers through IPIs as a way to implement
Windows FlushProcessWriteBuffers() on Linux.  They are referring to
sys_membarrier in their github thread, specifically stating that
sys_membarrier() is what they are looking for.

To explain the benefit of this scheme, let's introduce two example threads:

Thread A (non-frequent, e.g. executing liburcu synchronize_rcu())
Thread B (frequent, e.g. executing liburcu
rcu_read_lock()/rcu_read_unlock())

In a scheme where all smp_mb() in thread A are ordering memory accesses
with respect to smp_mb() present in Thread B, we can change each
smp_mb() within Thread A into calls to sys_membarrier() and each
smp_mb() within Thread B into compiler barriers "barrier()".

Before the change, we had, for each smp_mb() pairs:

Thread A                    Thread B
previous mem accesses       previous mem accesses
smp_mb()                    smp_mb()
following mem accesses      following mem accesses

After the change, these pairs become:

Thread A                    Thread B
prev mem accesses           prev mem accesses
sys_membarrier()            barrier()
follow mem accesses         follow mem accesses

As we can see, there are two possible scenarios: either Thread B memory
accesses do not happen concurrently with Thread A accesses (1), or they
do (2).

1) Non-concurrent Thread A vs Thread B accesses:

Thread A                    Thread B
prev mem accesses
sys_membarrier()
follow mem accesses
                            prev mem accesses
                            barrier()
                            follow mem accesses

In this case, thread B accesses will be weakly ordered. This is OK,
because at that point, thread A is not particularly interested in
ordering them with respect to its own accesses.

2) Concurrent Thread A vs Thread B accesses

Thread A                    Thread B
prev mem accesses           prev mem accesses
sys_membarrier()            barrier()
follow mem accesses         follow mem accesses

In this case, thread B accesses, which are ensured to be in program
order thanks to the compiler barrier, will be "upgraded" to full
smp_mb() by synchronize_sched().

* Benchmarks

On Intel Xeon E5405 (8 cores)
(one thread is calling sys_membarrier, the other 7 threads are busy
looping)

1000 non-expedited sys_membarrier calls in 33s =3D 33 milliseconds/call.

* User-space user of this system call: Userspace RCU library

Both the signal-based and the sys_membarrier userspace RCU schemes
permit us to remove the memory barrier from the userspace RCU
rcu_read_lock() and rcu_read_unlock() primitives, thus significantly
accelerating them. These memory barriers are replaced by compiler
barriers on the read-side, and all matching memory barriers on the
write-side are turned into an invocation of a memory barrier on all
active threads in the process. By letting the kernel perform this
synchronization rather than dumbly sending a signal to every process
threads (as we currently do), we diminish the number of unnecessary wake
ups and only issue the memory barriers on active threads. Non-running
threads do not need to execute such barrier anyway, because these are
implied by the scheduler context switches.

Results in liburcu:

Operations in 10s, 6 readers, 2 writers:

memory barriers in reader:    1701557485 reads, 2202847 writes
signal-based scheme:          9830061167 reads,    6700 writes
sys_membarrier:               9952759104 reads,     425 writes
sys_membarrier (dyn. check):  7970328887 reads,     425 writes

The dynamic sys_membarrier availability check adds some overhead to
the read-side compared to the signal-based scheme, but besides that,
sys_membarrier slightly outperforms the signal-based scheme. However,
this non-expedited sys_membarrier implementation has a much slower grace
period than signal and memory barrier schemes.

Besides diminishing the number of wake-ups, one major advantage of the
membarrier system call over the signal-based scheme is that it does not
need to reserve a signal. This plays much more nicely with libraries,
and with processes injected into for tracing purposes, for which we
cannot expect that signals will be unused by the application.

An expedited version of this system call can be added later on to speed
up the grace period. Its implementation will likely depend on reading
the cpu_curr()->mm without holding each CPU's rq lock.

This patch adds the system call to x86 and to asm-generic.

[1] http://urcu.so

membarrier(2) man page:

MEMBARRIER(2)              Linux Programmer's Manual             MEMBARRIER(2)

NAME
       membarrier - issue memory barriers on a set of threads

SYNOPSIS
       #include <linux/membarrier.h>

       int membarrier(int cmd, int flags);

DESCRIPTION
       The cmd argument is one of the following:

       MEMBARRIER_CMD_QUERY
              Query  the  set  of  supported commands. It returns a bitmask of
              supported commands.

       MEMBARRIER_CMD_SHARED
              Execute a memory barrier on all threads running on  the  system.
              Upon  return from system call, the caller thread is ensured that
              all running threads have passed through a state where all memory
              accesses  to  user-space  addresses  match program order between
              entry to and return from the system  call  (non-running  threads
              are de facto in such a state). This covers threads from all pro=E2=80=90
              cesses running on the system.  This command returns 0.

       The flags argument needs to be 0. For future extensions.

       All memory accesses performed  in  program  order  from  each  targeted
       thread is guaranteed to be ordered with respect to sys_membarrier(). If
       we use the semantic "barrier()" to represent a compiler barrier forcing
       memory  accesses  to  be performed in program order across the barrier,
       and smp_mb() to represent explicit memory barriers forcing full  memory
       ordering  across  the barrier, we have the following ordering table for
       each pair of barrier(), sys_membarrier() and smp_mb():

       The pair ordering is detailed as (O: ordered, X: not ordered):

                              barrier()   smp_mb() sys_membarrier()
              barrier()          X           X            O
              smp_mb()           X           O            O
              sys_membarrier()   O           O            O

RETURN VALUE
       On success, these system calls return zero.  On error, -1 is  returned,
       and errno is set appropriately. For a given command, with flags
       argument set to 0, this system call is guaranteed to always return the
       same value until reboot.

ERRORS
       ENOSYS System call is not implemented.

       EINVAL Invalid arguments.

Linux                             2015-04-15                     MEMBARRIER(2)

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Nicholas Miell <nmiell@comcast.net>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Alan Cox <gnomes@lxorguk.ukuu.org.uk>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Pranith Kumar <bobby.prani@gmail.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/Makefile     |  1 +
 kernel/membarrier.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sys_ni.c     |  3 +++
 3 files changed, 70 insertions(+)
 create mode 100644 kernel/membarrier.c

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index d4988410b410..53abf008ecb3 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -100,6 +100,7 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 obj-$(CONFIG_JUMP_LABEL) += jump_label.o
 obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
 obj-$(CONFIG_TORTURE_TEST) += torture.o
+obj-$(CONFIG_MEMBARRIER) += membarrier.o
 
 obj-$(CONFIG_HAS_IOMEM) += memremap.o
 
diff --git a/kernel/membarrier.c b/kernel/membarrier.c
new file mode 100644
index 000000000000..536c727a56e9
--- /dev/null
+++ b/kernel/membarrier.c
@@ -0,0 +1,66 @@
+/*
+ * Copyright (C) 2010, 2015 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ *
+ * membarrier system call
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/syscalls.h>
+#include <linux/membarrier.h>
+
+/*
+ * Bitmask made from a "or" of all commands within enum membarrier_cmd,
+ * except MEMBARRIER_CMD_QUERY.
+ */
+#define MEMBARRIER_CMD_BITMASK	(MEMBARRIER_CMD_SHARED)
+
+/**
+ * sys_membarrier - issue memory barriers on a set of threads
+ * @cmd:   Takes command values defined in enum membarrier_cmd.
+ * @flags: Currently needs to be 0. For future extensions.
+ *
+ * If this system call is not implemented, -ENOSYS is returned. If the
+ * command specified does not exist, or if the command argument is invalid,
+ * this system call returns -EINVAL. For a given command, with flags argument
+ * set to 0, this system call is guaranteed to always return the same value
+ * until reboot.
+ *
+ * All memory accesses performed in program order from each targeted thread
+ * is guaranteed to be ordered with respect to sys_membarrier(). If we use
+ * the semantic "barrier()" to represent a compiler barrier forcing memory
+ * accesses to be performed in program order across the barrier, and
+ * smp_mb() to represent explicit memory barriers forcing full memory
+ * ordering across the barrier, we have the following ordering table for
+ * each pair of barrier(), sys_membarrier() and smp_mb():
+ *
+ * The pair ordering is detailed as (O: ordered, X: not ordered):
+ *
+ *                        barrier()   smp_mb() sys_membarrier()
+ *        barrier()          X           X            O
+ *        smp_mb()           X           O            O
+ *        sys_membarrier()   O           O            O
+ */
+SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
+{
+	if (unlikely(flags))
+		return -EINVAL;
+	switch (cmd) {
+	case MEMBARRIER_CMD_QUERY:
+		return MEMBARRIER_CMD_BITMASK;
+	case MEMBARRIER_CMD_SHARED:
+		if (num_online_cpus() > 1)
+			synchronize_sched();
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 03c3875d9958..a02decf15583 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -245,3 +245,6 @@ cond_syscall(sys_bpf);
 
 /* execveat */
 cond_syscall(sys_execveat);
+
+/* membarrier */
+cond_syscall(sys_membarrier);
-- 
cgit v1.2.3


From 2619d7e9c92d524cb155ec89fd72875321512e5b Mon Sep 17 00:00:00 2001
From: John Stultz <john.stultz@linaro.org>
Date: Wed, 9 Sep 2015 16:07:30 -0700
Subject: time: Fix timekeeping_freqadjust()'s incorrect use of abs() instead
 of abs64()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The internal clocksteering done for fine-grained error
correction uses a logarithmic approximation, so any time
adjtimex() adjusts the clock steering, timekeeping_freqadjust()
quickly approximates the correct clock frequency over a series
of ticks.

Unfortunately, the logic in timekeeping_freqadjust(), introduced
in commit:

  dc491596f639 ("timekeeping: Rework frequency adjustments to work better w/ nohz")

used the abs() function with a s64 error value to calculate the
size of the approximated adjustment to be made.

Per include/linux/kernel.h:

  "abs() should not be used for 64-bit types (s64, u64, long long) - use abs64()".

Thus on 32-bit platforms, this resulted in the clocksteering to
take a quite dampended random walk trying to converge on the
proper frequency, which caused the adjustments to be made much
slower then intended (most easily observed when large
adjustments are made).

This patch fixes the issue by using abs64() instead.

Reported-by: Nuno Gonçalves <nunojpg@gmail.com>
Tested-by: Nuno Goncalves <nunojpg@gmail.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: <stable@vger.kernel.org> # v3.17+
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Miroslav Lichvar <mlichvar@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1441840051-20244-1-git-send-email-john.stultz@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/time/timekeeping.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index f6ee2e6b6f5d..3739ac6aa473 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1614,7 +1614,7 @@ static __always_inline void timekeeping_freqadjust(struct timekeeper *tk,
 	negative = (tick_error < 0);
 
 	/* Sort out the magnitude of the correction */
-	tick_error = abs(tick_error);
+	tick_error = abs64(tick_error);
 	for (adj = 0; tick_error > interval; adj++)
 		tick_error >>= 1;
 
-- 
cgit v1.2.3


From eef7635a22f6b144206b5ca2f1398f637acffc4d Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Fri, 11 Sep 2015 09:34:26 +0530
Subject: clockevents: Remove unused set_mode() callback

All users are migrated to the per-state callbacks, get rid of the
unused interface and the core support code.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linaro-kernel@lists.linaro.org
Cc: John Stultz <john.stultz@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/fd60de14cf6d125489c031207567bb255ad946f6.1441943991.git.viresh.kumar@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/time/clockevents.c | 42 +-----------------------------------
 kernel/time/tick-common.c |  1 -
 kernel/time/timer_list.c  | 54 +++++++++++++++++++++--------------------------
 3 files changed, 25 insertions(+), 72 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 50eb107f1198..a9b76a40319e 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -97,20 +97,6 @@ EXPORT_SYMBOL_GPL(clockevent_delta2ns);
 static int __clockevents_switch_state(struct clock_event_device *dev,
 				      enum clock_event_state state)
 {
-	/* Transition with legacy set_mode() callback */
-	if (dev->set_mode) {
-		/* Legacy callback doesn't support new modes */
-		if (state > CLOCK_EVT_STATE_ONESHOT)
-			return -ENOSYS;
-		/*
-		 * 'clock_event_state' and 'clock_event_mode' have 1-to-1
-		 * mapping until *_ONESHOT, and so a simple cast will work.
-		 */
-		dev->set_mode((enum clock_event_mode)state, dev);
-		dev->mode = (enum clock_event_mode)state;
-		return 0;
-	}
-
 	if (dev->features & CLOCK_EVT_FEAT_DUMMY)
 		return 0;
 
@@ -204,12 +190,8 @@ int clockevents_tick_resume(struct clock_event_device *dev)
 {
 	int ret = 0;
 
-	if (dev->set_mode) {
-		dev->set_mode(CLOCK_EVT_MODE_RESUME, dev);
-		dev->mode = CLOCK_EVT_MODE_RESUME;
-	} else if (dev->tick_resume) {
+	if (dev->tick_resume)
 		ret = dev->tick_resume(dev);
-	}
 
 	return ret;
 }
@@ -460,26 +442,6 @@ int clockevents_unbind_device(struct clock_event_device *ced, int cpu)
 }
 EXPORT_SYMBOL_GPL(clockevents_unbind_device);
 
-/* Sanity check of state transition callbacks */
-static int clockevents_sanity_check(struct clock_event_device *dev)
-{
-	/* Legacy set_mode() callback */
-	if (dev->set_mode) {
-		/* We shouldn't be supporting new modes now */
-		WARN_ON(dev->set_state_periodic || dev->set_state_oneshot ||
-			dev->set_state_shutdown || dev->tick_resume ||
-			dev->set_state_oneshot_stopped);
-
-		BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
-		return 0;
-	}
-
-	if (dev->features & CLOCK_EVT_FEAT_DUMMY)
-		return 0;
-
-	return 0;
-}
-
 /**
  * clockevents_register_device - register a clock event device
  * @dev:	device to register
@@ -488,8 +450,6 @@ void clockevents_register_device(struct clock_event_device *dev)
 {
 	unsigned long flags;
 
-	BUG_ON(clockevents_sanity_check(dev));
-
 	/* Initialize state to DETACHED */
 	clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED);
 
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index d11c55b6ab7d..4fcd99e12aa0 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -398,7 +398,6 @@ void tick_shutdown(unsigned int cpu)
 		 * the set mode function!
 		 */
 		clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED);
-		dev->mode = CLOCK_EVT_MODE_UNUSED;
 		clockevents_exchange_device(dev, NULL);
 		dev->event_handler = clockevents_handle_noop;
 		td->evtdev = NULL;
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 129c96033e46..f75e35b60149 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -225,7 +225,7 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
 		   (unsigned long long) dev->min_delta_ns);
 	SEQ_printf(m, " mult:           %u\n", dev->mult);
 	SEQ_printf(m, " shift:          %u\n", dev->shift);
-	SEQ_printf(m, " mode:           %d\n", dev->mode);
+	SEQ_printf(m, " mode:           %d\n", clockevent_get_state(dev));
 	SEQ_printf(m, " next_event:     %Ld nsecs\n",
 		   (unsigned long long) ktime_to_ns(dev->next_event));
 
@@ -233,40 +233,34 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
 	print_name_offset(m, dev->set_next_event);
 	SEQ_printf(m, "\n");
 
-	if (dev->set_mode) {
-		SEQ_printf(m, " set_mode:       ");
-		print_name_offset(m, dev->set_mode);
+	if (dev->set_state_shutdown) {
+		SEQ_printf(m, " shutdown: ");
+		print_name_offset(m, dev->set_state_shutdown);
 		SEQ_printf(m, "\n");
-	} else {
-		if (dev->set_state_shutdown) {
-			SEQ_printf(m, " shutdown: ");
-			print_name_offset(m, dev->set_state_shutdown);
-			SEQ_printf(m, "\n");
-		}
+	}
 
-		if (dev->set_state_periodic) {
-			SEQ_printf(m, " periodic: ");
-			print_name_offset(m, dev->set_state_periodic);
-			SEQ_printf(m, "\n");
-		}
+	if (dev->set_state_periodic) {
+		SEQ_printf(m, " periodic: ");
+		print_name_offset(m, dev->set_state_periodic);
+		SEQ_printf(m, "\n");
+	}
 
-		if (dev->set_state_oneshot) {
-			SEQ_printf(m, " oneshot:  ");
-			print_name_offset(m, dev->set_state_oneshot);
-			SEQ_printf(m, "\n");
-		}
+	if (dev->set_state_oneshot) {
+		SEQ_printf(m, " oneshot:  ");
+		print_name_offset(m, dev->set_state_oneshot);
+		SEQ_printf(m, "\n");
+	}
 
-		if (dev->set_state_oneshot_stopped) {
-			SEQ_printf(m, " oneshot stopped: ");
-			print_name_offset(m, dev->set_state_oneshot_stopped);
-			SEQ_printf(m, "\n");
-		}
+	if (dev->set_state_oneshot_stopped) {
+		SEQ_printf(m, " oneshot stopped: ");
+		print_name_offset(m, dev->set_state_oneshot_stopped);
+		SEQ_printf(m, "\n");
+	}
 
-		if (dev->tick_resume) {
-			SEQ_printf(m, " resume:   ");
-			print_name_offset(m, dev->tick_resume);
-			SEQ_printf(m, "\n");
-		}
+	if (dev->tick_resume) {
+		SEQ_printf(m, " resume:   ");
+		print_name_offset(m, dev->tick_resume);
+		SEQ_printf(m, "\n");
 	}
 
 	SEQ_printf(m, " event_handler:  ");
-- 
cgit v1.2.3


From 449e9cae58b06be1293858ec8e5d8cb728238baa Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@linux.intel.com>
Date: Mon, 1 Jun 2015 16:05:16 +0800
Subject: genirq: Move field 'node' from irq_data into irq_common_data

NUMA node information is per-irq instead of per-irqchip, so move it into
struct irq_common_data. Also use CONFIG_NUMA to guard irq_common_data.node.

Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Jason Cooper <jason@lakedaemon.net>
Cc: Kevin Cernekee <cernekee@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Link: http://lkml.kernel.org/r/1433145945-789-8-git-send-email-jiang.liu@linux.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/internals.h | 2 +-
 kernel/irq/irqdesc.c   | 4 +++-
 kernel/irq/irqdomain.c | 1 -
 3 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index eee4b385cffb..5ef0c2dbe930 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -194,7 +194,7 @@ static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc)
 
 static inline int irq_desc_get_node(struct irq_desc *desc)
 {
-	return irq_data_get_node(&desc->irq_data);
+	return irq_common_data_get_node(&desc->irq_common_data);
 }
 
 #ifdef CONFIG_PM_SLEEP
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 0a2a4b697bcb..7f3e9faa6e4d 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -52,11 +52,13 @@ static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node)
 
 static void desc_smp_init(struct irq_desc *desc, int node)
 {
-	desc->irq_data.node = node;
 	cpumask_copy(desc->irq_data.affinity, irq_default_affinity);
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	cpumask_clear(desc->pending_mask);
 #endif
+#ifdef CONFIG_NUMA
+	desc->irq_common_data.node = node;
+#endif
 }
 
 #else
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 79baaf8a7813..dc9d27c0c158 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -844,7 +844,6 @@ static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain,
 		child->parent_data = irq_data;
 		irq_data->irq = child->irq;
 		irq_data->common = child->common;
-		irq_data->node = child->node;
 		irq_data->domain = domain;
 	}
 
-- 
cgit v1.2.3


From af7080e040d223b5e7d0a8de28f7cea24ef017c4 Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@linux.intel.com>
Date: Mon, 1 Jun 2015 16:05:21 +0800
Subject: genirq: Move field 'handler_data' from irq_data into irq_common_data

Handler data (handler_data) is per-irq instead of per irqchip, so move
it into struct irq_common_data.

Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Jason Cooper <jason@lakedaemon.net>
Cc: Kevin Cernekee <cernekee@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Link: http://lkml.kernel.org/r/1433145945-789-13-git-send-email-jiang.liu@linux.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/chip.c    | 4 ++--
 kernel/irq/irqdesc.c | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6e40a9539763..a48e00e345d7 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -83,7 +83,7 @@ int irq_set_handler_data(unsigned int irq, void *data)
 
 	if (!desc)
 		return -EINVAL;
-	desc->irq_data.handler_data = data;
+	desc->irq_common_data.handler_data = data;
 	irq_put_desc_unlock(desc, flags);
 	return 0;
 }
@@ -796,7 +796,7 @@ irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle,
 		return;
 
 	__irq_do_set_handler(desc, handle, 1, NULL);
-	desc->irq_data.handler_data = data;
+	desc->irq_common_data.handler_data = data;
 
 	irq_put_desc_busunlock(desc, flags);
 }
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 7f3e9faa6e4d..594b3e349aac 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -72,11 +72,12 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
 {
 	int cpu;
 
+	desc->irq_common_data.handler_data = NULL;
+
 	desc->irq_data.common = &desc->irq_common_data;
 	desc->irq_data.irq = irq;
 	desc->irq_data.chip = &no_irq_chip;
 	desc->irq_data.chip_data = NULL;
-	desc->irq_data.handler_data = NULL;
 	desc->irq_data.msi_desc = NULL;
 	irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
 	irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
-- 
cgit v1.2.3


From 9df872faa7e1619e9278bec00ceaed2236533530 Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@linux.intel.com>
Date: Wed, 3 Jun 2015 11:47:50 +0800
Subject: genirq: Move field 'affinity' from irq_data into irq_common_data

Irq affinity mask is per-irq instead of per irqchip, so move it into
struct irq_common_data.

Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Jason Cooper <jason@lakedaemon.net>
Cc: Kevin Cernekee <cernekee@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Link: http://lkml.kernel.org/r/1433303281-27688-1-git-send-email-jiang.liu@linux.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/irqdesc.c |  9 +++++----
 kernel/irq/manage.c  | 12 ++++++------
 kernel/irq/proc.c    |  2 +-
 3 files changed, 12 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 594b3e349aac..bb48a5c1964e 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -38,12 +38,13 @@ static void __init init_irq_default_affinity(void)
 #ifdef CONFIG_SMP
 static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node)
 {
-	if (!zalloc_cpumask_var_node(&desc->irq_data.affinity, gfp, node))
+	if (!zalloc_cpumask_var_node(&desc->irq_common_data.affinity,
+				     gfp, node))
 		return -ENOMEM;
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	if (!zalloc_cpumask_var_node(&desc->pending_mask, gfp, node)) {
-		free_cpumask_var(desc->irq_data.affinity);
+		free_cpumask_var(desc->irq_common_data.affinity);
 		return -ENOMEM;
 	}
 #endif
@@ -52,7 +53,7 @@ static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node)
 
 static void desc_smp_init(struct irq_desc *desc, int node)
 {
-	cpumask_copy(desc->irq_data.affinity, irq_default_affinity);
+	cpumask_copy(desc->irq_common_data.affinity, irq_default_affinity);
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	cpumask_clear(desc->pending_mask);
 #endif
@@ -124,7 +125,7 @@ static void free_masks(struct irq_desc *desc)
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	free_cpumask_var(desc->pending_mask);
 #endif
-	free_cpumask_var(desc->irq_data.affinity);
+	free_cpumask_var(desc->irq_common_data.affinity);
 }
 #else
 static inline void free_masks(struct irq_desc *desc) { }
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ad1b064f94fe..f9a59f6cabd2 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -192,7 +192,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
 	switch (ret) {
 	case IRQ_SET_MASK_OK:
 	case IRQ_SET_MASK_OK_DONE:
-		cpumask_copy(data->affinity, mask);
+		cpumask_copy(desc->irq_common_data.affinity, mask);
 	case IRQ_SET_MASK_OK_NOCOPY:
 		irq_set_thread_affinity(desc);
 		ret = 0;
@@ -304,7 +304,7 @@ static void irq_affinity_notify(struct work_struct *work)
 	if (irq_move_pending(&desc->irq_data))
 		irq_get_pending(cpumask, desc);
 	else
-		cpumask_copy(cpumask, desc->irq_data.affinity);
+		cpumask_copy(cpumask, desc->irq_common_data.affinity);
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
 
 	notify->notify(notify, cpumask);
@@ -375,9 +375,9 @@ static int setup_affinity(struct irq_desc *desc, struct cpumask *mask)
 	 * one of the targets is online.
 	 */
 	if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) {
-		if (cpumask_intersects(desc->irq_data.affinity,
+		if (cpumask_intersects(desc->irq_common_data.affinity,
 				       cpu_online_mask))
-			set = desc->irq_data.affinity;
+			set = desc->irq_common_data.affinity;
 		else
 			irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET);
 	}
@@ -829,8 +829,8 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
 	 * This code is triggered unconditionally. Check the affinity
 	 * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out.
 	 */
-	if (desc->irq_data.affinity)
-		cpumask_copy(mask, desc->irq_data.affinity);
+	if (desc->irq_common_data.affinity)
+		cpumask_copy(mask, desc->irq_common_data.affinity);
 	else
 		valid = false;
 	raw_spin_unlock_irq(&desc->lock);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 0e97c142ce40..e3a8c9577ba6 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -39,7 +39,7 @@ static struct proc_dir_entry *root_irq_dir;
 static int show_irq_affinity(int type, struct seq_file *m, void *v)
 {
 	struct irq_desc *desc = irq_to_desc((long)m->private);
-	const struct cpumask *mask = desc->irq_data.affinity;
+	const struct cpumask *mask = desc->irq_common_data.affinity;
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	if (irqd_is_setaffinity_pending(&desc->irq_data))
-- 
cgit v1.2.3


From b237721c5d95082a803c0be686f56d2dd1de995b Mon Sep 17 00:00:00 2001
From: Jiang Liu <jiang.liu@linux.intel.com>
Date: Mon, 1 Jun 2015 16:05:43 +0800
Subject: genirq: Move field 'msi_desc' from irq_data into irq_common_data

MSI descriptors are per-irq instead of per irqchip, so move it into
struct irq_common_data.

Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Jason Cooper <jason@lakedaemon.net>
Cc: Kevin Cernekee <cernekee@gmail.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Marc Zyngier <marc.zyngier@arm.com>
Link: http://lkml.kernel.org/r/1433145945-789-35-git-send-email-jiang.liu@linux.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/chip.c    | 2 +-
 kernel/irq/irqdesc.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index a48e00e345d7..8c55d545558f 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -105,7 +105,7 @@ int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset,
 
 	if (!desc)
 		return -EINVAL;
-	desc->irq_data.msi_desc = entry;
+	desc->irq_common_data.msi_desc = entry;
 	if (entry && !irq_offset)
 		entry->irq = irq_base;
 	irq_put_desc_unlock(desc, flags);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index bb48a5c1964e..596669436f7a 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -74,12 +74,12 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
 	int cpu;
 
 	desc->irq_common_data.handler_data = NULL;
+	desc->irq_common_data.msi_desc = NULL;
 
 	desc->irq_data.common = &desc->irq_common_data;
 	desc->irq_data.irq = irq;
 	desc->irq_data.chip = &no_irq_chip;
 	desc->irq_data.chip_data = NULL;
-	desc->irq_data.msi_desc = NULL;
 	irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
 	irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
 	desc->handle_irq = handle_bad_irq;
-- 
cgit v1.2.3


From bd0b9ac405e1794d72533c3d487aa65b6b955a0c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 14 Sep 2015 10:42:37 +0200
Subject: genirq: Remove irq argument from irq flow handlers

Most interrupt flow handlers do not use the irq argument. Those few
which use it can retrieve the irq number from the irq descriptor.

Remove the argument.

Search and replace was done with coccinelle and some extra helper
scripts around it. Thanks to Julia for her help!

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Julia Lawall <Julia.Lawall@lip6.fr>
Cc: Jiang Liu <jiang.liu@linux.intel.com>
---
 kernel/irq/chip.c    | 27 ++++++++-------------------
 kernel/irq/handle.c  |  4 +++-
 kernel/irq/irqdesc.c |  2 +-
 kernel/irq/resend.c  |  2 +-
 4 files changed, 13 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 8c55d545558f..e28169dd1c36 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -372,7 +372,6 @@ static bool irq_may_run(struct irq_desc *desc)
 
 /**
  *	handle_simple_irq - Simple and software-decoded IRQs.
- *	@irq:	the interrupt number
  *	@desc:	the interrupt description structure for this irq
  *
  *	Simple interrupts are either sent from a demultiplexing interrupt
@@ -382,8 +381,7 @@ static bool irq_may_run(struct irq_desc *desc)
  *	Note: The caller is expected to handle the ack, clear, mask and
  *	unmask issues if necessary.
  */
-void
-handle_simple_irq(unsigned int irq, struct irq_desc *desc)
+void handle_simple_irq(struct irq_desc *desc)
 {
 	raw_spin_lock(&desc->lock);
 
@@ -425,7 +423,6 @@ static void cond_unmask_irq(struct irq_desc *desc)
 
 /**
  *	handle_level_irq - Level type irq handler
- *	@irq:	the interrupt number
  *	@desc:	the interrupt description structure for this irq
  *
  *	Level type interrupts are active as long as the hardware line has
@@ -433,8 +430,7 @@ static void cond_unmask_irq(struct irq_desc *desc)
  *	it after the associated handler has acknowledged the device, so the
  *	interrupt line is back to inactive.
  */
-void
-handle_level_irq(unsigned int irq, struct irq_desc *desc)
+void handle_level_irq(struct irq_desc *desc)
 {
 	raw_spin_lock(&desc->lock);
 	mask_ack_irq(desc);
@@ -496,7 +492,6 @@ static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip)
 
 /**
  *	handle_fasteoi_irq - irq handler for transparent controllers
- *	@irq:	the interrupt number
  *	@desc:	the interrupt description structure for this irq
  *
  *	Only a single callback will be issued to the chip: an ->eoi()
@@ -504,8 +499,7 @@ static void cond_unmask_eoi_irq(struct irq_desc *desc, struct irq_chip *chip)
  *	for modern forms of interrupt handlers, which handle the flow
  *	details in hardware, transparently.
  */
-void
-handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
+void handle_fasteoi_irq(struct irq_desc *desc)
 {
 	struct irq_chip *chip = desc->irq_data.chip;
 
@@ -546,7 +540,6 @@ EXPORT_SYMBOL_GPL(handle_fasteoi_irq);
 
 /**
  *	handle_edge_irq - edge type IRQ handler
- *	@irq:	the interrupt number
  *	@desc:	the interrupt description structure for this irq
  *
  *	Interrupt occures on the falling and/or rising edge of a hardware
@@ -560,8 +553,7 @@ EXPORT_SYMBOL_GPL(handle_fasteoi_irq);
  *	the handler was running. If all pending interrupts are handled, the
  *	loop is left.
  */
-void
-handle_edge_irq(unsigned int irq, struct irq_desc *desc)
+void handle_edge_irq(struct irq_desc *desc)
 {
 	raw_spin_lock(&desc->lock);
 
@@ -618,13 +610,12 @@ EXPORT_SYMBOL(handle_edge_irq);
 #ifdef CONFIG_IRQ_EDGE_EOI_HANDLER
 /**
  *	handle_edge_eoi_irq - edge eoi type IRQ handler
- *	@irq:	the interrupt number
  *	@desc:	the interrupt description structure for this irq
  *
  * Similar as the above handle_edge_irq, but using eoi and w/o the
  * mask/unmask logic.
  */
-void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc)
+void handle_edge_eoi_irq(struct irq_desc *desc)
 {
 	struct irq_chip *chip = irq_desc_get_chip(desc);
 
@@ -665,13 +656,11 @@ out_eoi:
 
 /**
  *	handle_percpu_irq - Per CPU local irq handler
- *	@irq:	the interrupt number
  *	@desc:	the interrupt description structure for this irq
  *
  *	Per CPU interrupts on SMP machines without locking requirements
  */
-void
-handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
+void handle_percpu_irq(struct irq_desc *desc)
 {
 	struct irq_chip *chip = irq_desc_get_chip(desc);
 
@@ -688,7 +677,6 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
 
 /**
  * handle_percpu_devid_irq - Per CPU local irq handler with per cpu dev ids
- * @irq:	the interrupt number
  * @desc:	the interrupt description structure for this irq
  *
  * Per CPU interrupts on SMP machines without locking requirements. Same as
@@ -698,11 +686,12 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
  * contain the real device id for the cpu on which this handler is
  * called
  */
-void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc)
+void handle_percpu_devid_irq(struct irq_desc *desc)
 {
 	struct irq_chip *chip = irq_desc_get_chip(desc);
 	struct irqaction *action = desc->action;
 	void *dev_id = raw_cpu_ptr(action->percpu_dev_id);
+	unsigned int irq = irq_desc_get_irq(desc);
 	irqreturn_t res;
 
 	kstat_incr_irqs_this_cpu(desc);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index b6eeea8a80c5..de41a68fc038 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -27,8 +27,10 @@
  *
  * Handles spurious and unhandled IRQ's. It also prints a debugmessage.
  */
-void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
+void handle_bad_irq(struct irq_desc *desc)
 {
+	unsigned int irq = irq_desc_get_irq(desc);
+
 	print_irq_desc(irq, desc);
 	kstat_incr_irqs_this_cpu(desc);
 	ack_bad_irq(irq);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 596669436f7a..239e2ae2c947 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -347,7 +347,7 @@ int generic_handle_irq(unsigned int irq)
 
 	if (!desc)
 		return -EINVAL;
-	generic_handle_irq_desc(irq, desc);
+	generic_handle_irq_desc(desc);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(generic_handle_irq);
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index dd95f44f99b2..b86886beee4f 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -38,7 +38,7 @@ static void resend_irqs(unsigned long arg)
 		clear_bit(irq, irqs_resend);
 		desc = irq_to_desc(irq);
 		local_irq_disable();
-		desc->handle_irq(irq, desc);
+		desc->handle_irq(desc);
 		local_irq_enable();
 	}
 }
-- 
cgit v1.2.3


From f9f9e7b776142fb1c0782cade004cc8e0147a199 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Sep 2015 11:51:12 -0400
Subject: Revert "cgroup: simplify threadgroup locking"

This reverts commit b5ba75b5fc0e8404e2c50cb68f39bb6a53fc916f.

d59cfc09c32a ("sched, cgroup: replace signal_struct->group_rwsem with
a global percpu_rwsem") and b5ba75b5fc0e ("cgroup: simplify
threadgroup locking") changed how cgroup synchronizes against task
fork and exits so that it uses global percpu_rwsem instead of
per-process rwsem; unfortunately, the write [un]lock paths of
percpu_rwsem always involve synchronize_rcu_expedited() which turned
out to be too expensive.

Improvements for percpu_rwsem are scheduled to be merged in the coming
v4.4-rc1 merge window which alleviates this issue.  For now, revert
the two commits to restore per-process rwsem.  They will be re-applied
for the v4.4-rc1 merge window.

Signed-off-by: Tejun Heo <tj@kernel.org>
Link: http://lkml.kernel.org/g/55F8097A.7000206@de.ibm.com
Reported-by: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: stable@vger.kernel.org # v4.2+
---
 kernel/cgroup.c | 45 +++++++++++++++++++++++++++++++++------------
 1 file changed, 33 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2cf0f79f1fc9..115091efa889 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2460,13 +2460,14 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
 	if (!cgrp)
 		return -ENODEV;
 
-	percpu_down_write(&cgroup_threadgroup_rwsem);
+retry_find_task:
 	rcu_read_lock();
 	if (pid) {
 		tsk = find_task_by_vpid(pid);
 		if (!tsk) {
+			rcu_read_unlock();
 			ret = -ESRCH;
-			goto out_unlock_rcu;
+			goto out_unlock_cgroup;
 		}
 	} else {
 		tsk = current;
@@ -2482,23 +2483,37 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
 	 */
 	if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
 		ret = -EINVAL;
-		goto out_unlock_rcu;
+		rcu_read_unlock();
+		goto out_unlock_cgroup;
 	}
 
 	get_task_struct(tsk);
 	rcu_read_unlock();
 
+	percpu_down_write(&cgroup_threadgroup_rwsem);
+	if (threadgroup) {
+		if (!thread_group_leader(tsk)) {
+			/*
+			 * a race with de_thread from another thread's exec()
+			 * may strip us of our leadership, if this happens,
+			 * there is no choice but to throw this task away and
+			 * try again; this is
+			 * "double-double-toil-and-trouble-check locking".
+			 */
+			percpu_up_write(&cgroup_threadgroup_rwsem);
+			put_task_struct(tsk);
+			goto retry_find_task;
+		}
+	}
+
 	ret = cgroup_procs_write_permission(tsk, cgrp, of);
 	if (!ret)
 		ret = cgroup_attach_task(cgrp, tsk, threadgroup);
 
-	put_task_struct(tsk);
-	goto out_unlock_threadgroup;
-
-out_unlock_rcu:
-	rcu_read_unlock();
-out_unlock_threadgroup:
 	percpu_up_write(&cgroup_threadgroup_rwsem);
+
+	put_task_struct(tsk);
+out_unlock_cgroup:
 	cgroup_kn_unlock(of->kn);
 	return ret ?: nbytes;
 }
@@ -2643,8 +2658,6 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 
 	lockdep_assert_held(&cgroup_mutex);
 
-	percpu_down_write(&cgroup_threadgroup_rwsem);
-
 	/* look up all csses currently attached to @cgrp's subtree */
 	down_read(&css_set_rwsem);
 	css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
@@ -2700,8 +2713,17 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 				goto out_finish;
 			last_task = task;
 
+			percpu_down_write(&cgroup_threadgroup_rwsem);
+			/* raced against de_thread() from another thread? */
+			if (!thread_group_leader(task)) {
+				percpu_up_write(&cgroup_threadgroup_rwsem);
+				put_task_struct(task);
+				continue;
+			}
+
 			ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
 
+			percpu_up_write(&cgroup_threadgroup_rwsem);
 			put_task_struct(task);
 
 			if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
@@ -2711,7 +2733,6 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 
 out_finish:
 	cgroup_migrate_finish(&preloaded_csets);
-	percpu_up_write(&cgroup_threadgroup_rwsem);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 0c986253b939cc14c69d4adbe2b4121bdf4aa220 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Sep 2015 11:51:12 -0400
Subject: Revert "sched, cgroup: replace signal_struct->group_rwsem with a
 global percpu_rwsem"

This reverts commit d59cfc09c32a2ae31f1c3bc2983a0cd79afb3f14.

d59cfc09c32a ("sched, cgroup: replace signal_struct->group_rwsem with
a global percpu_rwsem") and b5ba75b5fc0e ("cgroup: simplify
threadgroup locking") changed how cgroup synchronizes against task
fork and exits so that it uses global percpu_rwsem instead of
per-process rwsem; unfortunately, the write [un]lock paths of
percpu_rwsem always involve synchronize_rcu_expedited() which turned
out to be too expensive.

Improvements for percpu_rwsem are scheduled to be merged in the coming
v4.4-rc1 merge window which alleviates this issue.  For now, revert
the two commits to restore per-process rwsem.  They will be re-applied
for the v4.4-rc1 merge window.

Signed-off-by: Tejun Heo <tj@kernel.org>
Link: http://lkml.kernel.org/g/55F8097A.7000206@de.ibm.com
Reported-by: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: stable@vger.kernel.org # v4.2+
---
 kernel/cgroup.c | 77 ++++++++++++++++++++++++++++++++++++++++++---------------
 kernel/fork.c   |  4 +++
 2 files changed, 61 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 115091efa889..2c9eae6ad970 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -46,7 +46,6 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/rwsem.h>
-#include <linux/percpu-rwsem.h>
 #include <linux/string.h>
 #include <linux/sort.h>
 #include <linux/kmod.h>
@@ -104,8 +103,6 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
  */
 static DEFINE_SPINLOCK(release_agent_path_lock);
 
-struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
-
 #define cgroup_assert_mutex_or_rcu_locked()				\
 	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&			\
 			   !lockdep_is_held(&cgroup_mutex),		\
@@ -874,6 +871,48 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 	return cset;
 }
 
+void cgroup_threadgroup_change_begin(struct task_struct *tsk)
+{
+	down_read(&tsk->signal->group_rwsem);
+}
+
+void cgroup_threadgroup_change_end(struct task_struct *tsk)
+{
+	up_read(&tsk->signal->group_rwsem);
+}
+
+/**
+ * threadgroup_lock - lock threadgroup
+ * @tsk: member task of the threadgroup to lock
+ *
+ * Lock the threadgroup @tsk belongs to.  No new task is allowed to enter
+ * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or
+ * change ->group_leader/pid.  This is useful for cases where the threadgroup
+ * needs to stay stable across blockable operations.
+ *
+ * fork and exit explicitly call threadgroup_change_{begin|end}() for
+ * synchronization.  While held, no new task will be added to threadgroup
+ * and no existing live task will have its PF_EXITING set.
+ *
+ * de_thread() does threadgroup_change_{begin|end}() when a non-leader
+ * sub-thread becomes a new leader.
+ */
+static void threadgroup_lock(struct task_struct *tsk)
+{
+	down_write(&tsk->signal->group_rwsem);
+}
+
+/**
+ * threadgroup_unlock - unlock threadgroup
+ * @tsk: member task of the threadgroup to unlock
+ *
+ * Reverse threadgroup_lock().
+ */
+static inline void threadgroup_unlock(struct task_struct *tsk)
+{
+	up_write(&tsk->signal->group_rwsem);
+}
+
 static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
 {
 	struct cgroup *root_cgrp = kf_root->kn->priv;
@@ -2074,9 +2113,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
 	lockdep_assert_held(&css_set_rwsem);
 
 	/*
-	 * We are synchronized through cgroup_threadgroup_rwsem against
-	 * PF_EXITING setting such that we can't race against cgroup_exit()
-	 * changing the css_set to init_css_set and dropping the old one.
+	 * We are synchronized through threadgroup_lock() against PF_EXITING
+	 * setting such that we can't race against cgroup_exit() changing the
+	 * css_set to init_css_set and dropping the old one.
 	 */
 	WARN_ON_ONCE(tsk->flags & PF_EXITING);
 	old_cset = task_css_set(tsk);
@@ -2133,11 +2172,10 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
  * @src_cset and add it to @preloaded_csets, which should later be cleaned
  * up by cgroup_migrate_finish().
  *
- * This function may be called without holding cgroup_threadgroup_rwsem
- * even if the target is a process.  Threads may be created and destroyed
- * but as long as cgroup_mutex is not dropped, no new css_set can be put
- * into play and the preloaded css_sets are guaranteed to cover all
- * migrations.
+ * This function may be called without holding threadgroup_lock even if the
+ * target is a process.  Threads may be created and destroyed but as long
+ * as cgroup_mutex is not dropped, no new css_set can be put into play and
+ * the preloaded css_sets are guaranteed to cover all migrations.
  */
 static void cgroup_migrate_add_src(struct css_set *src_cset,
 				   struct cgroup *dst_cgrp,
@@ -2240,7 +2278,7 @@ err:
  * @threadgroup: whether @leader points to the whole process or a single task
  *
  * Migrate a process or task denoted by @leader to @cgrp.  If migrating a
- * process, the caller must be holding cgroup_threadgroup_rwsem.  The
+ * process, the caller must be holding threadgroup_lock of @leader.  The
  * caller is also responsible for invoking cgroup_migrate_add_src() and
  * cgroup_migrate_prepare_dst() on the targets before invoking this
  * function and following up with cgroup_migrate_finish().
@@ -2368,7 +2406,7 @@ out_release_tset:
  * @leader: the task or the leader of the threadgroup to be attached
  * @threadgroup: attach the whole threadgroup?
  *
- * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
+ * Call holding cgroup_mutex and threadgroup_lock of @leader.
  */
 static int cgroup_attach_task(struct cgroup *dst_cgrp,
 			      struct task_struct *leader, bool threadgroup)
@@ -2490,7 +2528,7 @@ retry_find_task:
 	get_task_struct(tsk);
 	rcu_read_unlock();
 
-	percpu_down_write(&cgroup_threadgroup_rwsem);
+	threadgroup_lock(tsk);
 	if (threadgroup) {
 		if (!thread_group_leader(tsk)) {
 			/*
@@ -2500,7 +2538,7 @@ retry_find_task:
 			 * try again; this is
 			 * "double-double-toil-and-trouble-check locking".
 			 */
-			percpu_up_write(&cgroup_threadgroup_rwsem);
+			threadgroup_unlock(tsk);
 			put_task_struct(tsk);
 			goto retry_find_task;
 		}
@@ -2510,7 +2548,7 @@ retry_find_task:
 	if (!ret)
 		ret = cgroup_attach_task(cgrp, tsk, threadgroup);
 
-	percpu_up_write(&cgroup_threadgroup_rwsem);
+	threadgroup_unlock(tsk);
 
 	put_task_struct(tsk);
 out_unlock_cgroup:
@@ -2713,17 +2751,17 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 				goto out_finish;
 			last_task = task;
 
-			percpu_down_write(&cgroup_threadgroup_rwsem);
+			threadgroup_lock(task);
 			/* raced against de_thread() from another thread? */
 			if (!thread_group_leader(task)) {
-				percpu_up_write(&cgroup_threadgroup_rwsem);
+				threadgroup_unlock(task);
 				put_task_struct(task);
 				continue;
 			}
 
 			ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
 
-			percpu_up_write(&cgroup_threadgroup_rwsem);
+			threadgroup_unlock(task);
 			put_task_struct(task);
 
 			if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
@@ -5045,7 +5083,6 @@ int __init cgroup_init(void)
 	unsigned long key;
 	int ssid, err;
 
-	BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 7d5f0f118a63..2845623fb582 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1149,6 +1149,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	tty_audit_fork(sig);
 	sched_autogroup_fork(sig);
 
+#ifdef CONFIG_CGROUPS
+	init_rwsem(&sig->group_rwsem);
+#endif
+
 	sig->oom_score_adj = current->signal->oom_score_adj;
 	sig->oom_score_adj_min = current->signal->oom_score_adj_min;
 
-- 
cgit v1.2.3


From de9b8f5dcbd94bfb1d249907a635f1fb1968e19c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 13 Aug 2015 23:09:29 +0200
Subject: sched: Fix crash trying to dequeue/enqueue the idle thread

Sasha reports that his virtual machine tries to schedule the idle
thread since commit 6c37067e2786 ("sched: Change the
sched_class::set_cpus_allowed() calling context").

Hit trace shows this happening from idle_thread_get()->init_idle(),
which is the _second_ init_idle() invocation on that task_struct, the
first being done through idle_init()->fork_idle(). (this code is
insane...)

Because we call init_idle() twice in a row, its ->sched_class ==
&idle_sched_class and ->on_rq = TASK_ON_RQ_QUEUED. This means
do_set_cpus_allowed() think we're queued and will call dequeue_task(),
which is implemented with BUG() for the idle class, seeing how
dequeueing the idle task is a daft thing.

Aside of the whole insanity of calling init_idle() _twice_, change the
code to call set_cpus_allowed_common() instead as this is 'obviously'
before the idle task gets ran etc..

Reported-by: Sasha Levin <sasha.levin@oracle.com>
Tested-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 6c37067e2786 ("sched: Change the sched_class::set_cpus_allowed() calling context")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 97d276ff1edb..f0d043ec0182 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4927,7 +4927,15 @@ void init_idle(struct task_struct *idle, int cpu)
 	idle->state = TASK_RUNNING;
 	idle->se.exec_start = sched_clock();
 
-	do_set_cpus_allowed(idle, cpumask_of(cpu));
+#ifdef CONFIG_SMP
+	/*
+	 * Its possible that init_idle() gets called multiple times on a task,
+	 * in that case do_set_cpus_allowed() will not do the right thing.
+	 *
+	 * And since this is boot we can forgo the serialization.
+	 */
+	set_cpus_allowed_common(idle, cpumask_of(cpu));
+#endif
 	/*
 	 * We're having a chicken and egg problem, even though we are
 	 * holding rq->lock, the cpu isn't yet set to this cpu so the
@@ -4944,7 +4952,7 @@ void init_idle(struct task_struct *idle, int cpu)
 
 	rq->curr = rq->idle = idle;
 	idle->on_rq = TASK_ON_RQ_QUEUED;
-#if defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
 	idle->on_cpu = 1;
 #endif
 	raw_spin_unlock(&rq->lock);
@@ -4959,7 +4967,7 @@ void init_idle(struct task_struct *idle, int cpu)
 	idle->sched_class = &idle_sched_class;
 	ftrace_graph_init_idle_task(idle, cpu);
 	vtime_init_idle(idle, cpu);
-#if defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
 	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
 #endif
 }
-- 
cgit v1.2.3


From f55fc2a57cc9ca3b1bb4fb8eb25b6e1989e5b993 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 9 Sep 2015 19:06:33 +0200
Subject: perf: Restructure perf syscall point of no return

The exclusive_event_installable() stuff only works because its
exclusive with the grouping bits.

Rework the code such that there is a sane place to error out before we
go do things we cannot undo.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 49 ++++++++++++++++++++++++++++++++-----------------
 1 file changed, 32 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index f548f69c4299..39679f749500 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8297,13 +8297,30 @@ SYSCALL_DEFINE5(perf_event_open,
 
 	if (move_group) {
 		gctx = group_leader->ctx;
+		mutex_lock_double(&gctx->mutex, &ctx->mutex);
+	} else {
+		mutex_lock(&ctx->mutex);
+	}
+
+	/*
+	 * Must be under the same ctx::mutex as perf_install_in_context(),
+	 * because we need to serialize with concurrent event creation.
+	 */
+	if (!exclusive_event_installable(event, ctx)) {
+		/* exclusive and group stuff are assumed mutually exclusive */
+		WARN_ON_ONCE(move_group);
+
+		err = -EBUSY;
+		goto err_locked;
+	}
 
+	WARN_ON_ONCE(ctx->parent_ctx);
+
+	if (move_group) {
 		/*
 		 * See perf_event_ctx_lock() for comments on the details
 		 * of swizzling perf_event::ctx.
 		 */
-		mutex_lock_double(&gctx->mutex, &ctx->mutex);
-
 		perf_remove_from_context(group_leader, false);
 
 		list_for_each_entry(sibling, &group_leader->sibling_list,
@@ -8311,13 +8328,7 @@ SYSCALL_DEFINE5(perf_event_open,
 			perf_remove_from_context(sibling, false);
 			put_ctx(gctx);
 		}
-	} else {
-		mutex_lock(&ctx->mutex);
-	}
-
-	WARN_ON_ONCE(ctx->parent_ctx);
 
-	if (move_group) {
 		/*
 		 * Wait for everybody to stop referencing the events through
 		 * the old lists, before installing it on new lists.
@@ -8349,22 +8360,20 @@ SYSCALL_DEFINE5(perf_event_open,
 		perf_event__state_init(group_leader);
 		perf_install_in_context(ctx, group_leader, group_leader->cpu);
 		get_ctx(ctx);
-	}
 
-	if (!exclusive_event_installable(event, ctx)) {
-		err = -EBUSY;
-		mutex_unlock(&ctx->mutex);
-		fput(event_file);
-		goto err_context;
+		/*
+		 * Now that all events are installed in @ctx, nothing
+		 * references @gctx anymore, so drop the last reference we have
+		 * on it.
+		 */
+		put_ctx(gctx);
 	}
 
 	perf_install_in_context(ctx, event, event->cpu);
 	perf_unpin_context(ctx);
 
-	if (move_group) {
+	if (move_group)
 		mutex_unlock(&gctx->mutex);
-		put_ctx(gctx);
-	}
 	mutex_unlock(&ctx->mutex);
 
 	put_online_cpus();
@@ -8391,6 +8400,12 @@ SYSCALL_DEFINE5(perf_event_open,
 	fd_install(event_fd, event_file);
 	return event_fd;
 
+err_locked:
+	if (move_group)
+		mutex_unlock(&gctx->mutex);
+	mutex_unlock(&ctx->mutex);
+/* err_file: */
+	fput(event_file);
 err_context:
 	perf_unpin_context(ctx);
 	put_ctx(ctx);
-- 
cgit v1.2.3


From a723968c0ed36db676478c3d26078f13484fe01c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 9 Sep 2015 19:06:33 +0200
Subject: perf: Fix u16 overflows

Vince reported that its possible to overflow the various size fields
and get weird stuff if you stick too many events in a group.

Put a lid on this by requiring the fixed record size not exceed 16k.
This is still a fair amount of events (silly amount really) and leaves
plenty room for callchains and stack dwarves while also avoiding
overflowing the u16 variables.

Reported-by: Vince Weaver <vincent.weaver@maine.edu>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 50 ++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 40 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 39679f749500..dbb5329b6a3a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1243,11 +1243,7 @@ static inline void perf_event__state_init(struct perf_event *event)
 					      PERF_EVENT_STATE_INACTIVE;
 }
 
-/*
- * Called at perf_event creation and when events are attached/detached from a
- * group.
- */
-static void perf_event__read_size(struct perf_event *event)
+static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
 {
 	int entry = sizeof(u64); /* value */
 	int size = 0;
@@ -1263,7 +1259,7 @@ static void perf_event__read_size(struct perf_event *event)
 		entry += sizeof(u64);
 
 	if (event->attr.read_format & PERF_FORMAT_GROUP) {
-		nr += event->group_leader->nr_siblings;
+		nr += nr_siblings;
 		size += sizeof(u64);
 	}
 
@@ -1271,14 +1267,11 @@ static void perf_event__read_size(struct perf_event *event)
 	event->read_size = size;
 }
 
-static void perf_event__header_size(struct perf_event *event)
+static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
 {
 	struct perf_sample_data *data;
-	u64 sample_type = event->attr.sample_type;
 	u16 size = 0;
 
-	perf_event__read_size(event);
-
 	if (sample_type & PERF_SAMPLE_IP)
 		size += sizeof(data->ip);
 
@@ -1303,6 +1296,17 @@ static void perf_event__header_size(struct perf_event *event)
 	event->header_size = size;
 }
 
+/*
+ * Called at perf_event creation and when events are attached/detached from a
+ * group.
+ */
+static void perf_event__header_size(struct perf_event *event)
+{
+	__perf_event_read_size(event,
+			       event->group_leader->nr_siblings);
+	__perf_event_header_size(event, event->attr.sample_type);
+}
+
 static void perf_event__id_header_size(struct perf_event *event)
 {
 	struct perf_sample_data *data;
@@ -1330,6 +1334,27 @@ static void perf_event__id_header_size(struct perf_event *event)
 	event->id_header_size = size;
 }
 
+static bool perf_event_validate_size(struct perf_event *event)
+{
+	/*
+	 * The values computed here will be over-written when we actually
+	 * attach the event.
+	 */
+	__perf_event_read_size(event, event->group_leader->nr_siblings + 1);
+	__perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
+	perf_event__id_header_size(event);
+
+	/*
+	 * Sum the lot; should not exceed the 64k limit we have on records.
+	 * Conservative limit to allow for callchains and other variable fields.
+	 */
+	if (event->read_size + event->header_size +
+	    event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
+		return false;
+
+	return true;
+}
+
 static void perf_group_attach(struct perf_event *event)
 {
 	struct perf_event *group_leader = event->group_leader, *pos;
@@ -8302,6 +8327,11 @@ SYSCALL_DEFINE5(perf_event_open,
 		mutex_lock(&ctx->mutex);
 	}
 
+	if (!perf_event_validate_size(event)) {
+		err = -E2BIG;
+		goto err_locked;
+	}
+
 	/*
 	 * Must be under the same ctx::mutex as perf_install_in_context(),
 	 * because we need to serialize with concurrent event creation.
-- 
cgit v1.2.3


From f73e22ab450140830005581c2c7ec389791a1b8d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 9 Sep 2015 20:48:22 +0200
Subject: perf: Fix races in computing the header sizes

There are two races with the current code:

 - Another event can join the group and compute a larger header_size
   concurrently, if the smaller store wins we'll have an incorrect
   header_size set.

 - We compute the header_size after the event becomes active,
   therefore its possible to use the size before its computed.

Remedy the first by moving the computation inside the ctx::mutex lock,
and the second by placing it _before_ perf_install_in_context().

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/events/core.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index dbb5329b6a3a..b11756f9b6dc 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8399,6 +8399,15 @@ SYSCALL_DEFINE5(perf_event_open,
 		put_ctx(gctx);
 	}
 
+	/*
+	 * Precalculate sample_data sizes; do while holding ctx::mutex such
+	 * that we're serialized against further additions and before
+	 * perf_install_in_context() which is the point the event is active and
+	 * can use these values.
+	 */
+	perf_event__header_size(event);
+	perf_event__id_header_size(event);
+
 	perf_install_in_context(ctx, event, event->cpu);
 	perf_unpin_context(ctx);
 
@@ -8414,12 +8423,6 @@ SYSCALL_DEFINE5(perf_event_open,
 	list_add_tail(&event->owner_entry, &current->perf_event_list);
 	mutex_unlock(&current->perf_event_mutex);
 
-	/*
-	 * Precalculate sample_data sizes
-	 */
-	perf_event__header_size(event);
-	perf_event__id_header_size(event);
-
 	/*
 	 * Drop the reference on the group_event after placing the
 	 * new event on the sibling_list. This ensures destruction
-- 
cgit v1.2.3


From 00cc1633816de8c95f337608a1ea64e228faf771 Mon Sep 17 00:00:00 2001
From: Dominik Dingel <dingel@linux.vnet.ibm.com>
Date: Fri, 18 Sep 2015 11:27:45 +0200
Subject: sched: access local runqueue directly in single_task_running

Commit 2ee507c47293 ("sched: Add function single_task_running to let a task
check if it is the only task running on a cpu") referenced the current
runqueue with the smp_processor_id.  When CONFIG_DEBUG_PREEMPT is enabled,
that is only allowed if preemption is disabled or the currrent task is
bound to the local cpu (e.g. kernel worker).

With commit f78195129963 ("kvm: add halt_poll_ns module parameter") KVM
calls single_task_running. If CONFIG_DEBUG_PREEMPT is enabled that
generates a lot of kernel messages.

To avoid adding preemption in that cases, as it would limit the usefulness,
we change single_task_running to access directly the cpu local runqueue.

Cc: Tim Chen <tim.c.chen@linux.intel.com>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: <stable@vger.kernel.org>
Fixes: 2ee507c472939db4b146d545352b8a7c79ef47f8
Signed-off-by: Dominik Dingel <dingel@linux.vnet.ibm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 kernel/sched/core.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3595403921bd..4064f794ab8c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2666,13 +2666,20 @@ unsigned long nr_running(void)
 
 /*
  * Check if only the current task is running on the cpu.
+ *
+ * Caution: this function does not check that the caller has disabled
+ * preemption, thus the result might have a time-of-check-to-time-of-use
+ * race.  The caller is responsible to use it correctly, for example:
+ *
+ * - from a non-preemptable section (of course)
+ *
+ * - from a thread that is bound to a single CPU
+ *
+ * - in a loop with very short iterations (e.g. a polling loop)
  */
 bool single_task_running(void)
 {
-	if (cpu_rq(smp_processor_id())->nr_running == 1)
-		return true;
-	else
-		return false;
+	return raw_rq()->nr_running == 1;
 }
 EXPORT_SYMBOL(single_task_running);
 
-- 
cgit v1.2.3


From 19a5ecde086a6a5287978b12ae948fa691b197b7 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 20 Sep 2015 21:01:22 -0700
Subject: rcu: Suppress lockdep false positive for rcp->exp_funnel_mutex

In kernels built with CONFIG_PREEMPT=y, synchronize_rcu_expedited()
invokes synchronize_sched_expedited() while holding RCU-preempt's
root rcu_node structure's ->exp_funnel_mutex, which is acquired after
the rcu_data structure's ->exp_funnel_mutex.  The first thing that
synchronize_sched_expedited() will do is acquire RCU-sched's rcu_data
structure's ->exp_funnel_mutex.   There is no danger of an actual deadlock
because the locking order is always from RCU-preempt's expedited mutexes
to those of RCU-sched.  Unfortunately, lockdep considers both rcu_data
structures' ->exp_funnel_mutex to be in the same lock class and therefore
reports a deadlock cycle.

This commit silences this false positive by placing RCU-sched's rcu_data
structures' ->exp_funnel_mutex locks into their own lock class.

Reported-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 9f75f25cc5d9..775d36cc0050 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3868,6 +3868,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
 static void __init
 rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
 {
+	static struct lock_class_key rcu_exp_sched_rdp_class;
 	unsigned long flags;
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 	struct rcu_node *rnp = rcu_get_root(rsp);
@@ -3883,6 +3884,10 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
 	mutex_init(&rdp->exp_funnel_mutex);
 	rcu_boot_init_nocb_percpu_data(rdp);
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
+	if (rsp == &rcu_sched_state)
+		lockdep_set_class_and_name(&rdp->exp_funnel_mutex,
+					   &rcu_exp_sched_rdp_class,
+					   "rcu_data_exp_sched");
 }
 
 /*
-- 
cgit v1.2.3


From ac5be6b47e8bd25b62bed2c82cda7398999f59e9 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Tue, 22 Sep 2015 14:58:49 -0700
Subject: userfaultfd: revert "userfaultfd: waitqueue: add nr wake parameter to
 __wake_up_locked_key"

This reverts commit 51360155eccb907ff8635bd10fc7de876408c2e0 and adapts
fs/userfaultfd.c to use the old version of that function.

It didn't look robust to call __wake_up_common with "nr == 1" when we
absolutely require wakeall semantics, but we've full control of what we
insert in the two waitqueue heads of the blocked userfaults.  No
exclusive waitqueue risks to be inserted into those two waitqueue heads
so we can as well stick to "nr == 1" of the old code and we can rely
purely on the fact no waitqueue inserted in one of the two waitqueue
heads we must enforce as wakeall, has wait->flags WQ_FLAG_EXCLUSIVE set.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Dr. David Alan Gilbert <dgilbert@redhat.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Thierry Reding <treding@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sched/wait.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 272d9322bc5d..052e02672d12 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -106,10 +106,9 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked);
 
-void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr,
-			  void *key)
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
 {
-	__wake_up_common(q, mode, nr, 0, key);
+	__wake_up_common(q, mode, 1, 0, key);
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked_key);
 
@@ -284,7 +283,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
 	if (!list_empty(&wait->task_list))
 		list_del_init(&wait->task_list);
 	else if (waitqueue_active(q))
-		__wake_up_locked_key(q, mode, 1, key);
+		__wake_up_locked_key(q, mode, key);
 	spin_unlock_irqrestore(&q->lock, flags);
 }
 EXPORT_SYMBOL(abort_exclusive_wait);
-- 
cgit v1.2.3


From 21199f27b430576552b26210b3194a363d7f05cd Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 16 Sep 2015 16:10:40 +0200
Subject: locking/lockdep: Fix hlock->pin_count reset on lock stack rebuilds

Various people reported hitting the "unpinning an unpinned lock"
warning. As it turns out there are 2 places where we take a lock out
of the middle of a stack, and in those cases it would fail to preserve
the pin_count when rebuilding the lock stack.

Reported-by: Sasha Levin <sasha.levin@oracle.com>
Reported-by: Tim Spriggs <tspriggs@apple.com>
Tested-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: davej@codemonkey.org.uk
Link: http://lkml.kernel.org/r/20150916141040.GA11639@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/locking/lockdep.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 8acfbf773e06..4e49cc4c9952 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3068,7 +3068,7 @@ static int __lock_is_held(struct lockdep_map *lock);
 static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 			  int trylock, int read, int check, int hardirqs_off,
 			  struct lockdep_map *nest_lock, unsigned long ip,
-			  int references)
+			  int references, int pin_count)
 {
 	struct task_struct *curr = current;
 	struct lock_class *class = NULL;
@@ -3157,7 +3157,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	hlock->waittime_stamp = 0;
 	hlock->holdtime_stamp = lockstat_clock();
 #endif
-	hlock->pin_count = 0;
+	hlock->pin_count = pin_count;
 
 	if (check && !mark_irqflags(curr, hlock))
 		return 0;
@@ -3343,7 +3343,7 @@ found_it:
 			hlock_class(hlock)->subclass, hlock->trylock,
 				hlock->read, hlock->check, hlock->hardirqs_off,
 				hlock->nest_lock, hlock->acquire_ip,
-				hlock->references))
+				hlock->references, hlock->pin_count))
 			return 0;
 	}
 
@@ -3433,7 +3433,7 @@ found_it:
 			hlock_class(hlock)->subclass, hlock->trylock,
 				hlock->read, hlock->check, hlock->hardirqs_off,
 				hlock->nest_lock, hlock->acquire_ip,
-				hlock->references))
+				hlock->references, hlock->pin_count))
 			return 0;
 	}
 
@@ -3583,7 +3583,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	current->lockdep_recursion = 1;
 	trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
 	__lock_acquire(lock, subclass, trylock, read, check,
-		       irqs_disabled_flags(flags), nest_lock, ip, 0);
+		       irqs_disabled_flags(flags), nest_lock, ip, 0, 0);
 	current->lockdep_recursion = 0;
 	raw_local_irq_restore(flags);
 }
-- 
cgit v1.2.3


From a1b7febd725a2cdfc8ac245b7b7437ce4b91aecb Mon Sep 17 00:00:00 2001
From: Maxime Ripard <maxime.ripard@free-electrons.com>
Date: Fri, 25 Sep 2015 18:09:32 +0200
Subject: genirq: Fix the documentation of request_percpu_irq

The documentation of request_percpu_irq is confusing and suggest that the
interrupt is not enabled at all, while it is actually enabled on the local
CPU.

Clarify that.

Signed-off-by: Maxime Ripard <maxime.ripard@free-electrons.com>
Signed-off-by: Gregory CLEMENT <gregory.clement@free-electrons.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/irq/manage.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index f9a59f6cabd2..a4360f0f62a5 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1790,9 +1790,10 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
  *	@devname: An ascii name for the claiming device
  *	@dev_id: A percpu cookie passed back to the handler function
  *
- *	This call allocates interrupt resources, but doesn't
- *	automatically enable the interrupt. It has to be done on each
- *	CPU using enable_percpu_irq().
+ *	This call allocates interrupt resources and enables the
+ *	interrupt on the local CPU. If the interrupt is supposed to be
+ *	enabled on other CPUs, it has to be done on each CPU using
+ *	enable_percpu_irq().
  *
  *	Dev_id must be globally unique. It is a per-cpu variable, and
  *	the handler gets called with the interrupted CPU's instance of
-- 
cgit v1.2.3


From aec2e2ad1786eb07814ae60988e0e306cd24a6cc Mon Sep 17 00:00:00 2001
From: Maxime Ripard <maxime.ripard@free-electrons.com>
Date: Fri, 25 Sep 2015 18:09:33 +0200
Subject: irq: Export per-cpu irq allocation and de-allocation functions

Some drivers might use the per-cpu interrupts and still might be built as a
module. Export request_percpu_irq an free_percpu_irq to these user, which
also make it consistent with enable/disable_percpu_irq that were exported.

Reported-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Maxime Ripard <maxime.ripard@free-electrons.com>
Signed-off-by: Gregory CLEMENT <gregory.clement@free-electrons.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/irq/manage.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index a4360f0f62a5..4c213864ec45 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1761,6 +1761,7 @@ void free_percpu_irq(unsigned int irq, void __percpu *dev_id)
 	kfree(__free_percpu_irq(irq, dev_id));
 	chip_bus_sync_unlock(desc);
 }
+EXPORT_SYMBOL_GPL(free_percpu_irq);
 
 /**
  *	setup_percpu_irq - setup a per-cpu interrupt
@@ -1832,6 +1833,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
 
 	return retval;
 }
+EXPORT_SYMBOL_GPL(request_percpu_irq);
 
 /**
  *	irq_get_irqchip_state - returns the irqchip state of a interrupt.
-- 
cgit v1.2.3


From a91263d520246b63c63e75ddfb072ee6a853fe15 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 30 Sep 2015 01:41:50 +0200
Subject: ebpf: migrate bpf_prog's flags to bitfield

As we need to add further flags to the bpf_prog structure, lets migrate
both bools to a bitfield representation. The size of the base structure
(excluding insns) remains unchanged at 40 bytes.

Add also tags for the kmemchecker, so that it doesn't throw false
positives. Even in case gcc would generate suboptimal code, it's not
being accessed in performance critical paths.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/core.c    | 4 ++++
 kernel/bpf/syscall.c | 4 ++--
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 67c380cfa9ca..c8855c2a7a48 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -82,6 +82,8 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
 	if (fp == NULL)
 		return NULL;
 
+	kmemcheck_annotate_bitfield(fp, meta);
+
 	aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags);
 	if (aux == NULL) {
 		vfree(fp);
@@ -110,6 +112,8 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
 
 	fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
 	if (fp != NULL) {
+		kmemcheck_annotate_bitfield(fp, meta);
+
 		memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
 		fp->pages = size / PAGE_SIZE;
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 35bac8e8b071..2190ab14b763 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -553,10 +553,10 @@ static int bpf_prog_load(union bpf_attr *attr)
 		goto free_prog;
 
 	prog->orig_prog = NULL;
-	prog->jited = false;
+	prog->jited = 0;
 
 	atomic_set(&prog->aux->refcnt, 1);
-	prog->gpl_compatible = is_gpl;
+	prog->gpl_compatible = is_gpl ? 1 : 0;
 
 	/* find program type: socket_filter vs tracing_filter */
 	err = find_prog_type(type, prog);
-- 
cgit v1.2.3


From c46646d0484f5d08e2bede9b45034ba5b8b489cc Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 30 Sep 2015 01:41:51 +0200
Subject: sched, bpf: add helper for retrieving routing realms

Using routing realms as part of the classifier is quite useful, it
can be viewed as a tag for one or multiple routing entries (think of
an analogy to net_cls cgroup for processes), set by user space routing
daemons or via iproute2 as an indicator for traffic classifiers and
later on processed in the eBPF program.

Unlike actions, the classifier can inspect device flags and enable
netif_keep_dst() if necessary. tc actions don't have that possibility,
but in case people know what they are doing, it can be used from there
as well (e.g. via devs that must keep dsts by design anyway).

If a realm is set, the handler returns the non-zero realm. User space
can set the full 32bit realm for the dst.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/syscall.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2190ab14b763..5f35f420c12f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -402,6 +402,8 @@ static void fixup_bpf_calls(struct bpf_prog *prog)
 			 */
 			BUG_ON(!prog->aux->ops->get_func_proto);
 
+			if (insn->imm == BPF_FUNC_get_route_realm)
+				prog->dst_needed = 1;
 			if (insn->imm == BPF_FUNC_tail_call) {
 				/* mark bpf_tail_call as different opcode
 				 * to avoid conditional branch in
-- 
cgit v1.2.3


From bab18991871545dfbd10c931eb0fe8f7637156a9 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 2 Oct 2015 15:17:33 +0200
Subject: bpf, seccomp: prepare for upcoming criu support

The current ongoing effort to dump existing cBPF seccomp filters back
to user space requires to hold the pre-transformed instructions like
we do in case of socket filters from sk_attach_filter() side, so they
can be reloaded in original form at a later point in time by utilities
such as criu.

To prepare for this, simply extend the bpf_prog_create_from_user()
API to hold a flag that tells whether we should store the original
or not. Also, fanout filters could make use of that in future for
things like diag. While fanout filters already use bpf_prog_destroy(),
move seccomp over to them as well to handle original programs when
present.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Tycho Andersen <tycho.andersen@canonical.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Tested-by: Tycho Andersen <tycho.andersen@canonical.com>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/seccomp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 5bd4779282df..06858a74bb9c 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -370,7 +370,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
 		return ERR_PTR(-ENOMEM);
 
 	ret = bpf_prog_create_from_user(&sfilter->prog, fprog,
-					seccomp_check_filter);
+					seccomp_check_filter, false);
 	if (ret < 0) {
 		kfree(sfilter);
 		return ERR_PTR(ret);
@@ -469,7 +469,7 @@ void get_seccomp_filter(struct task_struct *tsk)
 static inline void seccomp_filter_free(struct seccomp_filter *filter)
 {
 	if (filter) {
-		bpf_prog_free(filter->prog);
+		bpf_prog_destroy(filter->prog);
 		kfree(filter);
 	}
 }
-- 
cgit v1.2.3


From 0cdf5640e4f6940bdbbefee4bb0adb7dffb185ec Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 2 Oct 2015 18:42:00 +0200
Subject: ebpf: include perf_event only where really needed

Commit ea317b267e9d ("bpf: Add new bpf map type to store the pointer
to struct perf_event") added perf_event.h to the main eBPF header, so
it gets included for all users. perf_event.h is actually only needed
from array map side, so lets sanitize this a bit.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: Kaixu Xia <xiakaixu@huawei.com>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/arraymap.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 29ace107f236..2fecc4aed119 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -15,6 +15,7 @@
 #include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/filter.h>
+#include <linux/perf_event.h>
 
 /* Called from syscall */
 static struct bpf_map *array_map_alloc(union bpf_attr *attr)
-- 
cgit v1.2.3


From 3ad0040573b0c00f88488bc31958acd07a55ee2e Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Thu, 8 Oct 2015 01:20:39 +0200
Subject: bpf: split state from prandom_u32() and consolidate {c, e}BPF prngs

While recently arguing on a seccomp discussion that raw prandom_u32()
access shouldn't be exposed to unpriviledged user space, I forgot the
fact that SKF_AD_RANDOM extension actually already does it for some time
in cBPF via commit 4cd3675ebf74 ("filter: added BPF random opcode").

Since prandom_u32() is being used in a lot of critical networking code,
lets be more conservative and split their states. Furthermore, consolidate
eBPF and cBPF prandom handlers to use the new internal PRNG. For eBPF,
bpf_get_prandom_u32() was only accessible for priviledged users, but
should that change one day, we also don't want to leak raw sequences
through things like eBPF maps.

One thought was also to have own per bpf_prog states, but due to ABI
reasons this is not easily possible, i.e. the program code currently
cannot access bpf_prog itself, and copying the rnd_state to/from the
stack scratch space whenever a program uses the prng seems not really
worth the trouble and seems too hacky. If needed, taus113 could in such
cases be implemented within eBPF using a map entry to keep the state
space, or get_random_bytes() could become a second helper in cases where
performance would not be critical.

Both sides can trigger a one-time late init via prandom_init_once() on
the shared state. Performance-wise, there should even be a tiny gain
as bpf_user_rnd_u32() saves one function call. The PRNG needs to live
inside the BPF core since kernels could have a NET-less config as well.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Cc: Chema Gonzalez <chema@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/core.c    | 26 ++++++++++++++++++++++++++
 kernel/bpf/helpers.c |  7 +------
 kernel/bpf/syscall.c |  2 ++
 3 files changed, 29 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index c8855c2a7a48..80864712d2c4 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -731,6 +731,32 @@ void bpf_prog_free(struct bpf_prog *fp)
 }
 EXPORT_SYMBOL_GPL(bpf_prog_free);
 
+/* RNG for unpriviledged user space with separated state from prandom_u32(). */
+static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state);
+
+void bpf_user_rnd_init_once(void)
+{
+	prandom_init_once(&bpf_user_rnd_state);
+}
+
+u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	/* Should someone ever have the rather unwise idea to use some
+	 * of the registers passed into this function, then note that
+	 * this function is called from native eBPF and classic-to-eBPF
+	 * transformations. Register assignments from both sides are
+	 * different, f.e. classic always sets fn(ctx, A, X) here.
+	 */
+	struct rnd_state *state;
+	u32 res;
+
+	state = &get_cpu_var(bpf_user_rnd_state);
+	res = prandom_u32_state(state);
+	put_cpu_var(state);
+
+	return res;
+}
+
 /* Weak definitions of helper functions in case we don't have bpf syscall. */
 const struct bpf_func_proto bpf_map_lookup_elem_proto __weak;
 const struct bpf_func_proto bpf_map_update_elem_proto __weak;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1447ec09421e..4504ca66118d 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -93,13 +93,8 @@ const struct bpf_func_proto bpf_map_delete_elem_proto = {
 	.arg2_type	= ARG_PTR_TO_MAP_KEY,
 };
 
-static u64 bpf_get_prandom_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
-{
-	return prandom_u32();
-}
-
 const struct bpf_func_proto bpf_get_prandom_u32_proto = {
-	.func		= bpf_get_prandom_u32,
+	.func		= bpf_user_rnd_u32,
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
 };
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5f35f420c12f..c868cafbc00c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -404,6 +404,8 @@ static void fixup_bpf_calls(struct bpf_prog *prog)
 
 			if (insn->imm == BPF_FUNC_get_route_realm)
 				prog->dst_needed = 1;
+			if (insn->imm == BPF_FUNC_get_prandom_u32)
+				bpf_user_rnd_init_once();
 			if (insn->imm == BPF_FUNC_tail_call) {
 				/* mark bpf_tail_call as different opcode
 				 * to avoid conditional branch in
-- 
cgit v1.2.3


From ff936a04e5f28b7e0455be0e7fa91334f89e4b44 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Wed, 7 Oct 2015 10:55:41 -0700
Subject: bpf: fix cb access in socket filter programs

eBPF socket filter programs may see junk in 'u32 cb[5]' area,
since it could have been used by protocol layers earlier.

For socket filter programs used in af_packet we need to clean
20 bytes of skb->cb area if it could be used by the program.
For programs attached to TCP/UDP sockets we need to save/restore
these 20 bytes, since it's used by protocol layers.

Remove SK_RUN_FILTER macro, since it's no longer used.

Long term we may move this bpf cb area to per-cpu scratch, but that
requires addition of new 'per-cpu load/store' instructions,
so not suitable as a short term fix.

Fixes: d691f9e8d440 ("bpf: allow programs to write to certain skb fields")
Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/verifier.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b074b23000d6..f8da034c2258 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2024,7 +2024,7 @@ static int convert_ctx_accesses(struct verifier_env *env)
 
 		cnt = env->prog->aux->ops->
 			convert_ctx_access(type, insn->dst_reg, insn->src_reg,
-					   insn->off, insn_buf);
+					   insn->off, insn_buf, env->prog);
 		if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
 			verbose("bpf verifier is misconfigured\n");
 			return -EINVAL;
-- 
cgit v1.2.3


From 1be7f75d1668d6296b80bf35dcf6762393530afc Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Wed, 7 Oct 2015 22:23:21 -0700
Subject: bpf: enable non-root eBPF programs

In order to let unprivileged users load and execute eBPF programs
teach verifier to prevent pointer leaks.
Verifier will prevent
- any arithmetic on pointers
  (except R10+Imm which is used to compute stack addresses)
- comparison of pointers
  (except if (map_value_ptr == 0) ... )
- passing pointers to helper functions
- indirectly passing pointers in stack to helper functions
- returning pointer from bpf program
- storing pointers into ctx or maps

Spill/fill of pointers into stack is allowed, but mangling
of pointers stored in the stack or reading them byte by byte is not.

Within bpf programs the pointers do exist, since programs need to
be able to access maps, pass skb pointer to LD_ABS insns, etc
but programs cannot pass such pointer values to the outside
or obfuscate them.

Only allow BPF_PROG_TYPE_SOCKET_FILTER unprivileged programs,
so that socket filters (tcpdump), af_packet (quic acceleration)
and future kcm can use it.
tracing and tc cls/act program types still require root permissions,
since tracing actually needs to be able to see all kernel pointers
and tc is for root only.

For example, the following unprivileged socket filter program is allowed:
int bpf_prog1(struct __sk_buff *skb)
{
  u32 index = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol));
  u64 *value = bpf_map_lookup_elem(&my_map, &index);

  if (value)
	*value += skb->len;
  return 0;
}

but the following program is not:
int bpf_prog1(struct __sk_buff *skb)
{
  u32 index = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol));
  u64 *value = bpf_map_lookup_elem(&my_map, &index);

  if (value)
	*value += (u64) skb;
  return 0;
}
since it would leak the kernel address into the map.

Unprivileged socket filter bpf programs have access to the
following helper functions:
- map lookup/update/delete (but they cannot store kernel pointers into them)
- get_random (it's already exposed to unprivileged user space)
- get_smp_processor_id
- tail_call into another socket filter program
- ktime_get_ns

The feature is controlled by sysctl kernel.unprivileged_bpf_disabled.
This toggle defaults to off (0), but can be set true (1).  Once true,
bpf programs and maps cannot be accessed from unprivileged process,
and the toggle cannot be set back to false.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/syscall.c  |  11 +++---
 kernel/bpf/verifier.c | 106 +++++++++++++++++++++++++++++++++++++++++++++-----
 kernel/sysctl.c       |  13 +++++++
 3 files changed, 116 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c868cafbc00c..83697bc8e574 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -18,6 +18,8 @@
 #include <linux/filter.h>
 #include <linux/version.h>
 
+int sysctl_unprivileged_bpf_disabled __read_mostly;
+
 static LIST_HEAD(bpf_map_types);
 
 static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
@@ -544,6 +546,9 @@ static int bpf_prog_load(union bpf_attr *attr)
 	    attr->kern_version != LINUX_VERSION_CODE)
 		return -EINVAL;
 
+	if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
 	/* plain bpf_prog allocation */
 	prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
 	if (!prog)
@@ -599,11 +604,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	union bpf_attr attr = {};
 	int err;
 
-	/* the syscall is limited to root temporarily. This restriction will be
-	 * lifted when security audit is clean. Note that eBPF+tracing must have
-	 * this restriction, since it may pass kernel data to user space
-	 */
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled)
 		return -EPERM;
 
 	if (!access_ok(VERIFY_READ, uattr, 1))
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f8da034c2258..1d6b97be79e1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -199,6 +199,7 @@ struct verifier_env {
 	struct verifier_state_list **explored_states; /* search pruning optimization */
 	struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
 	u32 used_map_cnt;		/* number of used maps */
+	bool allow_ptr_leaks;
 };
 
 /* verbose verifier prints what it's seeing
@@ -538,6 +539,21 @@ static int bpf_size_to_bytes(int bpf_size)
 		return -EINVAL;
 }
 
+static bool is_spillable_regtype(enum bpf_reg_type type)
+{
+	switch (type) {
+	case PTR_TO_MAP_VALUE:
+	case PTR_TO_MAP_VALUE_OR_NULL:
+	case PTR_TO_STACK:
+	case PTR_TO_CTX:
+	case FRAME_PTR:
+	case CONST_PTR_TO_MAP:
+		return true;
+	default:
+		return false;
+	}
+}
+
 /* check_stack_read/write functions track spill/fill of registers,
  * stack boundary and alignment are checked in check_mem_access()
  */
@@ -550,9 +566,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size,
 	 */
 
 	if (value_regno >= 0 &&
-	    (state->regs[value_regno].type == PTR_TO_MAP_VALUE ||
-	     state->regs[value_regno].type == PTR_TO_STACK ||
-	     state->regs[value_regno].type == PTR_TO_CTX)) {
+	    is_spillable_regtype(state->regs[value_regno].type)) {
 
 		/* register containing pointer is being spilled into stack */
 		if (size != BPF_REG_SIZE) {
@@ -643,6 +657,20 @@ static int check_ctx_access(struct verifier_env *env, int off, int size,
 	return -EACCES;
 }
 
+static bool is_pointer_value(struct verifier_env *env, int regno)
+{
+	if (env->allow_ptr_leaks)
+		return false;
+
+	switch (env->cur_state.regs[regno].type) {
+	case UNKNOWN_VALUE:
+	case CONST_IMM:
+		return false;
+	default:
+		return true;
+	}
+}
+
 /* check whether memory at (regno + off) is accessible for t = (read | write)
  * if t==write, value_regno is a register which value is stored into memory
  * if t==read, value_regno is a register which will receive the value from memory
@@ -669,11 +697,21 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
 	}
 
 	if (state->regs[regno].type == PTR_TO_MAP_VALUE) {
+		if (t == BPF_WRITE && value_regno >= 0 &&
+		    is_pointer_value(env, value_regno)) {
+			verbose("R%d leaks addr into map\n", value_regno);
+			return -EACCES;
+		}
 		err = check_map_access(env, regno, off, size);
 		if (!err && t == BPF_READ && value_regno >= 0)
 			mark_reg_unknown_value(state->regs, value_regno);
 
 	} else if (state->regs[regno].type == PTR_TO_CTX) {
+		if (t == BPF_WRITE && value_regno >= 0 &&
+		    is_pointer_value(env, value_regno)) {
+			verbose("R%d leaks addr into ctx\n", value_regno);
+			return -EACCES;
+		}
 		err = check_ctx_access(env, off, size, t);
 		if (!err && t == BPF_READ && value_regno >= 0)
 			mark_reg_unknown_value(state->regs, value_regno);
@@ -684,10 +722,17 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
 			verbose("invalid stack off=%d size=%d\n", off, size);
 			return -EACCES;
 		}
-		if (t == BPF_WRITE)
+		if (t == BPF_WRITE) {
+			if (!env->allow_ptr_leaks &&
+			    state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL &&
+			    size != BPF_REG_SIZE) {
+				verbose("attempt to corrupt spilled pointer on stack\n");
+				return -EACCES;
+			}
 			err = check_stack_write(state, off, size, value_regno);
-		else
+		} else {
 			err = check_stack_read(state, off, size, value_regno);
+		}
 	} else {
 		verbose("R%d invalid mem access '%s'\n",
 			regno, reg_type_str[state->regs[regno].type]);
@@ -775,8 +820,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
 		return -EACCES;
 	}
 
-	if (arg_type == ARG_ANYTHING)
+	if (arg_type == ARG_ANYTHING) {
+		if (is_pointer_value(env, regno)) {
+			verbose("R%d leaks addr into helper function\n", regno);
+			return -EACCES;
+		}
 		return 0;
+	}
 
 	if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY ||
 	    arg_type == ARG_PTR_TO_MAP_VALUE) {
@@ -950,8 +1000,9 @@ static int check_call(struct verifier_env *env, int func_id)
 }
 
 /* check validity of 32-bit and 64-bit arithmetic operations */
-static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
+static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
 {
+	struct reg_state *regs = env->cur_state.regs;
 	u8 opcode = BPF_OP(insn->code);
 	int err;
 
@@ -976,6 +1027,12 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
 		if (err)
 			return err;
 
+		if (is_pointer_value(env, insn->dst_reg)) {
+			verbose("R%d pointer arithmetic prohibited\n",
+				insn->dst_reg);
+			return -EACCES;
+		}
+
 		/* check dest operand */
 		err = check_reg_arg(regs, insn->dst_reg, DST_OP);
 		if (err)
@@ -1012,6 +1069,11 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
 				 */
 				regs[insn->dst_reg] = regs[insn->src_reg];
 			} else {
+				if (is_pointer_value(env, insn->src_reg)) {
+					verbose("R%d partial copy of pointer\n",
+						insn->src_reg);
+					return -EACCES;
+				}
 				regs[insn->dst_reg].type = UNKNOWN_VALUE;
 				regs[insn->dst_reg].map_ptr = NULL;
 			}
@@ -1061,8 +1123,18 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
 		/* pattern match 'bpf_add Rx, imm' instruction */
 		if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
 		    regs[insn->dst_reg].type == FRAME_PTR &&
-		    BPF_SRC(insn->code) == BPF_K)
+		    BPF_SRC(insn->code) == BPF_K) {
 			stack_relative = true;
+		} else if (is_pointer_value(env, insn->dst_reg)) {
+			verbose("R%d pointer arithmetic prohibited\n",
+				insn->dst_reg);
+			return -EACCES;
+		} else if (BPF_SRC(insn->code) == BPF_X &&
+			   is_pointer_value(env, insn->src_reg)) {
+			verbose("R%d pointer arithmetic prohibited\n",
+				insn->src_reg);
+			return -EACCES;
+		}
 
 		/* check dest operand */
 		err = check_reg_arg(regs, insn->dst_reg, DST_OP);
@@ -1101,6 +1173,12 @@ static int check_cond_jmp_op(struct verifier_env *env,
 		err = check_reg_arg(regs, insn->src_reg, SRC_OP);
 		if (err)
 			return err;
+
+		if (is_pointer_value(env, insn->src_reg)) {
+			verbose("R%d pointer comparison prohibited\n",
+				insn->src_reg);
+			return -EACCES;
+		}
 	} else {
 		if (insn->src_reg != BPF_REG_0) {
 			verbose("BPF_JMP uses reserved fields\n");
@@ -1155,6 +1233,9 @@ static int check_cond_jmp_op(struct verifier_env *env,
 			regs[insn->dst_reg].type = CONST_IMM;
 			regs[insn->dst_reg].imm = 0;
 		}
+	} else if (is_pointer_value(env, insn->dst_reg)) {
+		verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
+		return -EACCES;
 	} else if (BPF_SRC(insn->code) == BPF_K &&
 		   (opcode == BPF_JEQ || opcode == BPF_JNE)) {
 
@@ -1658,7 +1739,7 @@ static int do_check(struct verifier_env *env)
 		}
 
 		if (class == BPF_ALU || class == BPF_ALU64) {
-			err = check_alu_op(regs, insn);
+			err = check_alu_op(env, insn);
 			if (err)
 				return err;
 
@@ -1816,6 +1897,11 @@ static int do_check(struct verifier_env *env)
 				if (err)
 					return err;
 
+				if (is_pointer_value(env, BPF_REG_0)) {
+					verbose("R0 leaks addr as return value\n");
+					return -EACCES;
+				}
+
 process_bpf_exit:
 				insn_idx = pop_stack(env, &prev_insn_idx);
 				if (insn_idx < 0) {
@@ -2144,6 +2230,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 	if (ret < 0)
 		goto skip_full_check;
 
+	env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
+
 	ret = do_check(env);
 
 skip_full_check:
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e69201d8094e..96c856b04081 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -64,6 +64,7 @@
 #include <linux/binfmts.h>
 #include <linux/sched/sysctl.h>
 #include <linux/kexec.h>
+#include <linux/bpf.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -1138,6 +1139,18 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= timer_migration_handler,
 	},
+#endif
+#ifdef CONFIG_BPF_SYSCALL
+	{
+		.procname	= "unprivileged_bpf_disabled",
+		.data		= &sysctl_unprivileged_bpf_disabled,
+		.maxlen		= sizeof(sysctl_unprivileged_bpf_disabled),
+		.mode		= 0644,
+		/* only handle a transition from default "0" to "1" */
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one,
+		.extra2		= &one,
+	},
 #endif
 	{ }
 };
-- 
cgit v1.2.3


From aaac3ba95e4c8b496d22f68bd1bc01cfbf525eca Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Wed, 7 Oct 2015 22:23:22 -0700
Subject: bpf: charge user for creation of BPF maps and programs

since eBPF programs and maps use kernel memory consider it 'locked' memory
from user accounting point of view and charge it against RLIMIT_MEMLOCK limit.
This limit is typically set to 64Kbytes by distros, so almost all
bpf+tracing programs would need to increase it, since they use maps,
but kernel charges maximum map size upfront.
For example the hash map of 1024 elements will be charged as 64Kbyte.
It's inconvenient for current users and changes current behavior for root,
but probably worth doing to be consistent root vs non-root.

Similar accounting logic is done by mmap of perf_event.

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/bpf/arraymap.c |  2 +-
 kernel/bpf/hashtab.c  |  4 ++++
 kernel/bpf/syscall.c  | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 68 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 2fecc4aed119..f2d9e698c753 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -49,7 +49,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	array->map.key_size = attr->key_size;
 	array->map.value_size = attr->value_size;
 	array->map.max_entries = attr->max_entries;
-
+	array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT;
 	array->elem_size = elem_size;
 
 	return &array->map;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 83c209d9b17a..28592d79502b 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -88,6 +88,10 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	htab->elem_size = sizeof(struct htab_elem) +
 			  round_up(htab->map.key_size, 8) +
 			  htab->map.value_size;
+
+	htab->map.pages = round_up(htab->n_buckets * sizeof(struct hlist_head) +
+				   htab->elem_size * htab->map.max_entries,
+				   PAGE_SIZE) >> PAGE_SHIFT;
 	return &htab->map;
 
 free_htab:
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 83697bc8e574..f640e5f7afbd 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -46,11 +46,38 @@ void bpf_register_map_type(struct bpf_map_type_list *tl)
 	list_add(&tl->list_node, &bpf_map_types);
 }
 
+static int bpf_map_charge_memlock(struct bpf_map *map)
+{
+	struct user_struct *user = get_current_user();
+	unsigned long memlock_limit;
+
+	memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+	atomic_long_add(map->pages, &user->locked_vm);
+
+	if (atomic_long_read(&user->locked_vm) > memlock_limit) {
+		atomic_long_sub(map->pages, &user->locked_vm);
+		free_uid(user);
+		return -EPERM;
+	}
+	map->user = user;
+	return 0;
+}
+
+static void bpf_map_uncharge_memlock(struct bpf_map *map)
+{
+	struct user_struct *user = map->user;
+
+	atomic_long_sub(map->pages, &user->locked_vm);
+	free_uid(user);
+}
+
 /* called from workqueue */
 static void bpf_map_free_deferred(struct work_struct *work)
 {
 	struct bpf_map *map = container_of(work, struct bpf_map, work);
 
+	bpf_map_uncharge_memlock(map);
 	/* implementation dependent freeing */
 	map->ops->map_free(map);
 }
@@ -110,6 +137,10 @@ static int map_create(union bpf_attr *attr)
 
 	atomic_set(&map->refcnt, 1);
 
+	err = bpf_map_charge_memlock(map);
+	if (err)
+		goto free_map;
+
 	err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);
 
 	if (err < 0)
@@ -442,11 +473,37 @@ static void free_used_maps(struct bpf_prog_aux *aux)
 	kfree(aux->used_maps);
 }
 
+static int bpf_prog_charge_memlock(struct bpf_prog *prog)
+{
+	struct user_struct *user = get_current_user();
+	unsigned long memlock_limit;
+
+	memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+	atomic_long_add(prog->pages, &user->locked_vm);
+	if (atomic_long_read(&user->locked_vm) > memlock_limit) {
+		atomic_long_sub(prog->pages, &user->locked_vm);
+		free_uid(user);
+		return -EPERM;
+	}
+	prog->aux->user = user;
+	return 0;
+}
+
+static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
+{
+	struct user_struct *user = prog->aux->user;
+
+	atomic_long_sub(prog->pages, &user->locked_vm);
+	free_uid(user);
+}
+
 static void __prog_put_rcu(struct rcu_head *rcu)
 {
 	struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
 
 	free_used_maps(aux);
+	bpf_prog_uncharge_memlock(aux->prog);
 	bpf_prog_free(aux->prog);
 }
 
@@ -554,6 +611,10 @@ static int bpf_prog_load(union bpf_attr *attr)
 	if (!prog)
 		return -ENOMEM;
 
+	err = bpf_prog_charge_memlock(prog);
+	if (err)
+		goto free_prog_nouncharge;
+
 	prog->len = attr->insn_cnt;
 
 	err = -EFAULT;
@@ -595,6 +656,8 @@ static int bpf_prog_load(union bpf_attr *attr)
 free_used_maps:
 	free_used_maps(prog->aux);
 free_prog:
+	bpf_prog_uncharge_memlock(prog);
+free_prog_nouncharge:
 	bpf_prog_free(prog);
 	return err;
 }
-- 
cgit v1.2.3