From 9a9686b634acc5cb6b7c601c171ae64af0318a24 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 22 Apr 2010 17:29:24 +0800
Subject: cgroup: Fix an RCU warning in cgroup_path()

with CONFIG_PROVE_RCU=y, a warning can be triggered:

  # mount -t cgroup -o debug xxx /mnt
  # cat /proc/$$/cgroup

...
kernel/cgroup.c:1649 invoked rcu_dereference_check() without protection!
...

This is a false-positive, because cgroup_path() can be called
with either rcu_read_lock() held or cgroup_mutex held.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/cgroup.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e2769e13980c..4ca928db890c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1646,7 +1646,9 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 {
 	char *start;
-	struct dentry *dentry = rcu_dereference(cgrp->dentry);
+	struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
+						      rcu_read_lock_held() ||
+						      cgroup_lock_is_held());
 
 	if (!dentry || cgrp == dummytop) {
 		/*
@@ -1662,13 +1664,17 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 	*--start = '\0';
 	for (;;) {
 		int len = dentry->d_name.len;
+
 		if ((start -= len) < buf)
 			return -ENAMETOOLONG;
-		memcpy(start, cgrp->dentry->d_name.name, len);
+		memcpy(start, dentry->d_name.name, len);
 		cgrp = cgrp->parent;
 		if (!cgrp)
 			break;
-		dentry = rcu_dereference(cgrp->dentry);
+
+		dentry = rcu_dereference_check(cgrp->dentry,
+					       rcu_read_lock_held() ||
+					       cgroup_lock_is_held());
 		if (!cgrp->parent)
 			continue;
 		if (--start < buf)
-- 
cgit v1.2.3


From fae9c791703606636c1220e47f6690660042ce7f Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 22 Apr 2010 17:30:00 +0800
Subject: cgroup: Fix an RCU warning in alloc_css_id()

With CONFIG_PROVE_RCU=y, a warning can be triggered:

  # mount -t cgroup -o memory xxx /mnt
  # mkdir /mnt/0

...
kernel/cgroup.c:4442 invoked rcu_dereference_check() without protection!
...

This is a false-positive. It's safe to directly access parent_css->id.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/cgroup.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4ca928db890c..3a53c771e503 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4561,13 +4561,13 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
 {
 	int subsys_id, i, depth = 0;
 	struct cgroup_subsys_state *parent_css, *child_css;
-	struct css_id *child_id, *parent_id = NULL;
+	struct css_id *child_id, *parent_id;
 
 	subsys_id = ss->subsys_id;
 	parent_css = parent->subsys[subsys_id];
 	child_css = child->subsys[subsys_id];
-	depth = css_depth(parent_css) + 1;
 	parent_id = parent_css->id;
+	depth = parent_id->depth;
 
 	child_id = get_new_cssid(ss, depth);
 	if (IS_ERR(child_id))
-- 
cgit v1.2.3


From b629317e66fb1c6066c550dded45ab85a936163c Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 22 Apr 2010 17:30:40 +0800
Subject: sched: Fix an RCU warning in print_task()

With CONFIG_PROVE_RCU=y, a warning can be triggered:

  $ cat /proc/sched_debug

...
kernel/cgroup.c:1649 invoked rcu_dereference_check() without protection!
...

Both cgroup_path() and task_group() should be called with either
rcu_read_lock or cgroup_mutex held.

The rcu_dereference_check() does include cgroup_lock_is_held(), so we
know that this lock is not held.  Therefore, in a CONFIG_PREEMPT kernel,
to say nothing of a CONFIG_PREEMPT_RT kernel, the original code could
have ended up copying a string out of the freelist.

This patch inserts RCU read-side primitives needed to prevent this
scenario.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/sched_debug.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 9b49db144037..19be00ba6123 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -114,7 +114,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
 	{
 		char path[64];
 
+		rcu_read_lock();
 		cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
+		rcu_read_unlock();
 		SEQ_printf(m, " %s", path);
 	}
 #endif
-- 
cgit v1.2.3


From ee84b8243b07c33a5c8aed42b4b2da60cb16d1d2 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 6 May 2010 09:28:41 -0700
Subject: rcu: create rcu_my_thread_group_empty() wrapper

Some RCU-lockdep splat repairs need to know whether they are running
in a single-threaded process.  Unfortunately, the thread_group_empty()
primitive is defined in sched.h, and can induce #include hell.  This
commit therefore introduces a rcu_my_thread_group_empty() wrapper that
is defined in rcupdate.c, thus avoiding the need to include sched.h
everywhere.

Signed-off-by: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
---
 kernel/rcupdate.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 03a7ea1579f6..49d808e833b0 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -122,3 +122,14 @@ void wakeme_after_rcu(struct rcu_head  *head)
 	rcu = container_of(head, struct rcu_synchronize, head);
 	complete(&rcu->completion);
 }
+
+#ifdef CONFIG_PROVE_RCU
+/*
+ * wrapper function to avoid #include problems.
+ */
+int rcu_my_thread_group_empty(void)
+{
+	return thread_group_empty(current);
+}
+EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty);
+#endif /* #ifdef CONFIG_PROVE_RCU */
-- 
cgit v1.2.3


From 34441427aab4bdb3069a4ffcda69a99357abcb2e Mon Sep 17 00:00:00 2001
From: Robin Holt <holt@sgi.com>
Date: Tue, 11 May 2010 14:06:46 -0700
Subject: revert "procfs: provide stack information for threads" and its fixup
 commits

Originally, commit d899bf7b ("procfs: provide stack information for
threads") attempted to introduce a new feature for showing where the
threadstack was located and how many pages are being utilized by the
stack.

Commit c44972f1 ("procfs: disable per-task stack usage on NOMMU") was
applied to fix the NO_MMU case.

Commit 89240ba0 ("x86, fs: Fix x86 procfs stack information for threads on
64-bit") was applied to fix a bug in ia32 executables being loaded.

Commit 9ebd4eba7 ("procfs: fix /proc/<pid>/stat stack pointer for kernel
threads") was applied to fix a bug which had kernel threads printing a
userland stack address.

Commit 1306d603f ('proc: partially revert "procfs: provide stack
information for threads"') was then applied to revert the stack pages
being used to solve a significant performance regression.

This patch nearly undoes the effect of all these patches.

The reason for reverting these is it provides an unusable value in
field 28.  For x86_64, a fork will result in the task->stack_start
value being updated to the current user top of stack and not the stack
start address.  This unpredictability of the stack_start value makes
it worthless.  That includes the intended use of showing how much stack
space a thread has.

Other architectures will get different values.  As an example, ia64
gets 0.  The do_fork() and copy_process() functions appear to treat the
stack_start and stack_size parameters as architecture specific.

I only partially reverted c44972f1 ("procfs: disable per-task stack usage
on NOMMU") .  If I had completely reverted it, I would have had to change
mm/Makefile only build pagewalk.o when CONFIG_PROC_PAGE_MONITOR is
configured.  Since I could not test the builds without significant effort,
I decided to not change mm/Makefile.

I only partially reverted 89240ba0 ("x86, fs: Fix x86 procfs stack
information for threads on 64-bit") .  I left the KSTK_ESP() change in
place as that seemed worthwhile.

Signed-off-by: Robin Holt <holt@sgi.com>
Cc: Stefani Seibold <stefani@seibold.net>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Michal Simek <monstr@monstr.eu>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 44b0791b0a2e..4c14942a0ee3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1114,8 +1114,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	p->bts = NULL;
 
-	p->stack_start = stack_start;
-
 	/* Perform scheduler related setup. Assign this task to a CPU. */
 	sched_fork(p, clone_flags);
 
-- 
cgit v1.2.3


From 475f9aa6aa538befcbd0fa95bdebada600f247cd Mon Sep 17 00:00:00 2001
From: Vitaly Mayatskikh <v.mayatskih@gmail.com>
Date: Tue, 11 May 2010 14:06:51 -0700
Subject: kexec: fix OOPS in crash_kernel_shrink

Two "echo 0 > /sys/kernel/kexec_crash_size" OOPSes kernel.  Also content
of this file is invalid after first shrink to zero: it shows 1 instead of
0.

This scenario is unlikely to happen often (root privs, valid crashkernel=
in cmdline, dump-capture kernel not loaded), I hit it only by chance.

This patch fixes it.

Signed-off-by: Vitaly Mayatskikh <v.mayatskih@gmail.com>
Cc: Cong Wang <amwang@redhat.com>
Cc: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index 87ebe8adc474..474a84715eac 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1134,11 +1134,9 @@ int crash_shrink_memory(unsigned long new_size)
 
 	free_reserved_phys_range(end, crashk_res.end);
 
-	if (start == end) {
-		crashk_res.end = end;
+	if (start == end)
 		release_resource(&crashk_res);
-	} else
-		crashk_res.end = end - 1;
+	crashk_res.end = end - 1;
 
 unlock:
 	mutex_unlock(&kexec_mutex);
-- 
cgit v1.2.3


From 11cad320a4f4bc53d3585c85600c782faa12b99e Mon Sep 17 00:00:00 2001
From: Vitaliy Gusev <vgusev@openvz.org>
Date: Tue, 11 May 2010 14:06:56 -0700
Subject: bsdacct: use del_timer_sync() in acct_exit_ns()

acct_exit_ns --> acct_file_reopen deletes timer without check timer
execution on other CPUs.  So acct_timeout() can change an unmapped memory.

Signed-off-by: Vitaliy Gusev <vgusev@openvz.org>
Cc: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/acct.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/acct.c b/kernel/acct.c
index 24f8c81fc48d..e4c0e1fee9b0 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -353,17 +353,18 @@ restart:
 
 void acct_exit_ns(struct pid_namespace *ns)
 {
-	struct bsd_acct_struct *acct;
+	struct bsd_acct_struct *acct = ns->bacct;
 
-	spin_lock(&acct_lock);
-	acct = ns->bacct;
-	if (acct != NULL) {
-		if (acct->file != NULL)
-			acct_file_reopen(acct, NULL, NULL);
+	if (acct == NULL)
+		return;
 
-		kfree(acct);
-	}
+	del_timer_sync(&acct->timer);
+	spin_lock(&acct_lock);
+	if (acct->file != NULL)
+		acct_file_reopen(acct, NULL, NULL);
 	spin_unlock(&acct_lock);
+
+	kfree(acct);
 }
 
 /*
-- 
cgit v1.2.3


From 7f0f15464185a92f9d8791ad231bcd7bf6df54e4 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Tue, 11 May 2010 14:06:58 -0700
Subject: memcg: fix css_id() RCU locking for real

Commit ad4ba375373937817404fd92239ef4cadbded23b ("memcg: css_id() must be
called under rcu_read_lock()") modifies memcontol.c for fixing RCU check
message.  But Andrew Morton pointed out that the fix doesn't seems sane
and it was just for hidining lockdep messages.

This is a patch for do proper things.  Checking again, all places,
accessing without rcu_read_lock, that commit fixies was intentional....
all callers of css_id() has reference count on it.  So, it's not necessary
to be under rcu_read_lock().

Considering again, we can use rcu_dereference_check for css_id().  We know
css->id is valid if css->refcnt > 0.  (css->id never changes and freed
after css->refcnt going to be 0.)

This patch makes use of rcu_dereference_check() in css_id/depth and remove
unnecessary rcu-read-lock added by the commit.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3a53c771e503..6db8b7f297a1 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4435,7 +4435,15 @@ __setup("cgroup_disable=", cgroup_disable);
  */
 unsigned short css_id(struct cgroup_subsys_state *css)
 {
-	struct css_id *cssid = rcu_dereference(css->id);
+	struct css_id *cssid;
+
+	/*
+	 * This css_id() can return correct value when somone has refcnt
+	 * on this or this is under rcu_read_lock(). Once css->id is allocated,
+	 * it's unchanged until freed.
+	 */
+	cssid = rcu_dereference_check(css->id,
+			rcu_read_lock_held() || atomic_read(&css->refcnt));
 
 	if (cssid)
 		return cssid->id;
@@ -4445,7 +4453,10 @@ EXPORT_SYMBOL_GPL(css_id);
 
 unsigned short css_depth(struct cgroup_subsys_state *css)
 {
-	struct css_id *cssid = rcu_dereference(css->id);
+	struct css_id *cssid;
+
+	cssid = rcu_dereference_check(css->id,
+			rcu_read_lock_held() || atomic_read(&css->refcnt));
 
 	if (cssid)
 		return cssid->depth;
-- 
cgit v1.2.3


From 747388d78a0ae768fd82b55c4ed38aa646a72364 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Tue, 11 May 2010 14:06:59 -0700
Subject: memcg: fix css_is_ancestor() RCU locking

Some callers (in memcontrol.c) calls css_is_ancestor() without
rcu_read_lock.  Because css_is_ancestor() has to access RCU protected
data, it should be under rcu_read_lock().

This makes css_is_ancestor() itself does safe access to RCU protected
area.  (At least, "root" can have refcnt==0 if it's not an ancestor of
"child".  So, we need rcu_read_lock().)

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 6db8b7f297a1..6d870f2d1228 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4464,15 +4464,36 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
 }
 EXPORT_SYMBOL_GPL(css_depth);
 
+/**
+ *  css_is_ancestor - test "root" css is an ancestor of "child"
+ * @child: the css to be tested.
+ * @root: the css supporsed to be an ancestor of the child.
+ *
+ * Returns true if "root" is an ancestor of "child" in its hierarchy. Because
+ * this function reads css->id, this use rcu_dereference() and rcu_read_lock().
+ * But, considering usual usage, the csses should be valid objects after test.
+ * Assuming that the caller will do some action to the child if this returns
+ * returns true, the caller must take "child";s reference count.
+ * If "child" is valid object and this returns true, "root" is valid, too.
+ */
+
 bool css_is_ancestor(struct cgroup_subsys_state *child,
 		    const struct cgroup_subsys_state *root)
 {
-	struct css_id *child_id = rcu_dereference(child->id);
-	struct css_id *root_id = rcu_dereference(root->id);
+	struct css_id *child_id;
+	struct css_id *root_id;
+	bool ret = true;
 
-	if (!child_id || !root_id || (child_id->depth < root_id->depth))
-		return false;
-	return child_id->stack[root_id->depth] == root_id->id;
+	rcu_read_lock();
+	child_id  = rcu_dereference(child->id);
+	root_id = rcu_dereference(root->id);
+	if (!child_id
+	    || !root_id
+	    || (child_id->depth < root_id->depth)
+	    || (child_id->stack[root_id->depth] != root_id->id))
+		ret = false;
+	rcu_read_unlock();
+	return ret;
 }
 
 static void __free_css_id_cb(struct rcu_head *head)
-- 
cgit v1.2.3


From 16a2164bb03612efe79a76c73da6da44445b9287 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Fri, 14 May 2010 19:44:10 -0700
Subject: profile: fix stats and data leakage

If the kernel is large or the profiling step small, /proc/profile
leaks data and readprofile shows silly stats, until readprofile -r
has reset the buffer: clear the prof_buffer when it is vmalloc()ed.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/profile.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/profile.c b/kernel/profile.c
index a55d3a367ae8..dfadc5b729f1 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -127,8 +127,10 @@ int __ref profile_init(void)
 		return 0;
 
 	prof_buffer = vmalloc(buffer_bytes);
-	if (prof_buffer)
+	if (prof_buffer) {
+		memset(prof_buffer, 0, buffer_bytes);
 		return 0;
+	}
 
 	free_cpumask_var(prof_cpu_mask);
 	return -ENOMEM;
-- 
cgit v1.2.3