From c757249af152c59fd74b85e52e8c090acb33d9c0 Mon Sep 17 00:00:00 2001
From: Shailabh Nagar <nagar@watson.ibm.com>
Date: Fri, 14 Jul 2006 00:24:40 -0700
Subject: [PATCH] per-task-delay-accounting: taskstats interface

Create a "taskstats" interface based on generic netlink (NETLINK_GENERIC
family), for getting statistics of tasks and thread groups during their
lifetime and when they exit.  The interface is intended for use by multiple
accounting packages though it is being created in the context of delay
accounting.

This patch creates the interface without populating the fields of the data
that is sent to the user in response to a command or upon the exit of a task.
Each accounting package interested in using taskstats has to provide an
additional patch to add its stats to the common structure.

[akpm@osdl.org: cleanups, Kconfig fix]
Signed-off-by: Shailabh Nagar <nagar@us.ibm.com>
Signed-off-by: Balbir Singh <balbir@in.ibm.com>
Cc: Jes Sorensen <jes@sgi.com>
Cc: Peter Chubb <peterc@gelato.unsw.edu.au>
Cc: Erich Focht <efocht@ess.nec.de>
Cc: Levent Serinol <lserinol@gmail.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/taskstats.c | 336 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 336 insertions(+)
 create mode 100644 kernel/taskstats.c

(limited to 'kernel/taskstats.c')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
new file mode 100644
index 000000000000..82ec9137d908
--- /dev/null
+++ b/kernel/taskstats.c
@@ -0,0 +1,336 @@
+/*
+ * taskstats.c - Export per-task statistics to userland
+ *
+ * Copyright (C) Shailabh Nagar, IBM Corp. 2006
+ *           (C) Balbir Singh,   IBM Corp. 2006
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/taskstats_kern.h>
+#include <net/genetlink.h>
+#include <asm/atomic.h>
+
+static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
+static int family_registered;
+kmem_cache_t *taskstats_cache;
+static DEFINE_MUTEX(taskstats_exit_mutex);
+
+static struct genl_family family = {
+	.id		= GENL_ID_GENERATE,
+	.name		= TASKSTATS_GENL_NAME,
+	.version	= TASKSTATS_GENL_VERSION,
+	.maxattr	= TASKSTATS_CMD_ATTR_MAX,
+};
+
+static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1]
+__read_mostly = {
+	[TASKSTATS_CMD_ATTR_PID]  = { .type = NLA_U32 },
+	[TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
+};
+
+
+static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
+			void **replyp, size_t size)
+{
+	struct sk_buff *skb;
+	void *reply;
+
+	/*
+	 * If new attributes are added, please revisit this allocation
+	 */
+	skb = nlmsg_new(size);
+	if (!skb)
+		return -ENOMEM;
+
+	if (!info) {
+		int seq = get_cpu_var(taskstats_seqnum)++;
+		put_cpu_var(taskstats_seqnum);
+
+		reply = genlmsg_put(skb, 0, seq,
+				family.id, 0, 0,
+				cmd, family.version);
+	} else
+		reply = genlmsg_put(skb, info->snd_pid, info->snd_seq,
+				family.id, 0, 0,
+				cmd, family.version);
+	if (reply == NULL) {
+		nlmsg_free(skb);
+		return -EINVAL;
+	}
+
+	*skbp = skb;
+	*replyp = reply;
+	return 0;
+}
+
+static int send_reply(struct sk_buff *skb, pid_t pid, int event)
+{
+	struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
+	void *reply;
+	int rc;
+
+	reply = genlmsg_data(genlhdr);
+
+	rc = genlmsg_end(skb, reply);
+	if (rc < 0) {
+		nlmsg_free(skb);
+		return rc;
+	}
+
+	if (event == TASKSTATS_MSG_MULTICAST)
+		return genlmsg_multicast(skb, pid, TASKSTATS_LISTEN_GROUP);
+	return genlmsg_unicast(skb, pid);
+}
+
+static int fill_pid(pid_t pid, struct task_struct *pidtsk,
+		struct taskstats *stats)
+{
+	int rc;
+	struct task_struct *tsk = pidtsk;
+
+	if (!pidtsk) {
+		read_lock(&tasklist_lock);
+		tsk = find_task_by_pid(pid);
+		if (!tsk) {
+			read_unlock(&tasklist_lock);
+			return -ESRCH;
+		}
+		get_task_struct(tsk);
+		read_unlock(&tasklist_lock);
+	} else
+		get_task_struct(tsk);
+
+	/*
+	 * Each accounting subsystem adds calls to its functions to
+	 * fill in relevant parts of struct taskstsats as follows
+	 *
+	 *	rc = per-task-foo(stats, tsk);
+	 *	if (rc)
+	 *		goto err;
+	 */
+
+err:
+	put_task_struct(tsk);
+	return rc;
+
+}
+
+static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk,
+		struct taskstats *stats)
+{
+	int rc;
+	struct task_struct *tsk, *first;
+
+	first = tgidtsk;
+	read_lock(&tasklist_lock);
+	if (!first) {
+		first = find_task_by_pid(tgid);
+		if (!first) {
+			read_unlock(&tasklist_lock);
+			return -ESRCH;
+		}
+	}
+	tsk = first;
+	do {
+		/*
+		 * Each accounting subsystem adds calls its functions to
+		 * fill in relevant parts of struct taskstsats as follows
+		 *
+		 *	rc = per-task-foo(stats, tsk);
+		 *	if (rc)
+		 *		break;
+		 */
+
+	} while_each_thread(first, tsk);
+	read_unlock(&tasklist_lock);
+
+	/*
+	 * Accounting subsytems can also add calls here if they don't
+	 * wish to aggregate statistics for per-tgid stats
+	 */
+
+	return rc;
+}
+
+static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info)
+{
+	int rc = 0;
+	struct sk_buff *rep_skb;
+	struct taskstats stats;
+	void *reply;
+	size_t size;
+	struct nlattr *na;
+
+	/*
+	 * Size includes space for nested attributes
+	 */
+	size = nla_total_size(sizeof(u32)) +
+		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
+
+	memset(&stats, 0, sizeof(stats));
+	rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
+	if (rc < 0)
+		return rc;
+
+	if (info->attrs[TASKSTATS_CMD_ATTR_PID]) {
+		u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
+		rc = fill_pid(pid, NULL, &stats);
+		if (rc < 0)
+			goto err;
+
+		na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
+		NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid);
+		NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
+				stats);
+	} else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
+		u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
+		rc = fill_tgid(tgid, NULL, &stats);
+		if (rc < 0)
+			goto err;
+
+		na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
+		NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid);
+		NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
+				stats);
+	} else {
+		rc = -EINVAL;
+		goto err;
+	}
+
+	nla_nest_end(rep_skb, na);
+
+	return send_reply(rep_skb, info->snd_pid, TASKSTATS_MSG_UNICAST);
+
+nla_put_failure:
+	return genlmsg_cancel(rep_skb, reply);
+err:
+	nlmsg_free(rep_skb);
+	return rc;
+}
+
+/* Send pid data out on exit */
+void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
+			struct taskstats *tgidstats)
+{
+	int rc;
+	struct sk_buff *rep_skb;
+	void *reply;
+	size_t size;
+	int is_thread_group;
+	struct nlattr *na;
+
+	if (!family_registered || !tidstats)
+		return;
+
+	mutex_lock(&taskstats_exit_mutex);
+
+	is_thread_group = !thread_group_empty(tsk);
+	rc = 0;
+
+	/*
+	 * Size includes space for nested attributes
+	 */
+	size = nla_total_size(sizeof(u32)) +
+		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
+
+	if (is_thread_group)
+		size = 2 * size;	/* PID + STATS + TGID + STATS */
+
+	rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
+	if (rc < 0)
+		goto ret;
+
+	rc = fill_pid(tsk->pid, tsk, tidstats);
+	if (rc < 0)
+		goto err_skb;
+
+	na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
+	NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid);
+	NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
+			*tidstats);
+	nla_nest_end(rep_skb, na);
+
+	if (!is_thread_group || !tgidstats) {
+		send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST);
+		goto ret;
+	}
+
+	rc = fill_tgid(tsk->pid, tsk, tgidstats);
+	/*
+	 * If fill_tgid() failed then one probable reason could be that the
+	 * thread group leader has exited. fill_tgid() will fail, send out
+	 * the pid statistics collected earlier.
+	 */
+	if (rc < 0) {
+		send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST);
+		goto ret;
+	}
+
+	na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
+	NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid);
+	NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
+			*tgidstats);
+	nla_nest_end(rep_skb, na);
+
+	send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST);
+	goto ret;
+
+nla_put_failure:
+	genlmsg_cancel(rep_skb, reply);
+	goto ret;
+err_skb:
+	nlmsg_free(rep_skb);
+ret:
+	mutex_unlock(&taskstats_exit_mutex);
+	return;
+}
+
+static struct genl_ops taskstats_ops = {
+	.cmd		= TASKSTATS_CMD_GET,
+	.doit		= taskstats_send_stats,
+	.policy		= taskstats_cmd_get_policy,
+};
+
+/* Needed early in initialization */
+void __init taskstats_init_early(void)
+{
+	taskstats_cache = kmem_cache_create("taskstats_cache",
+						sizeof(struct taskstats),
+						0, SLAB_PANIC, NULL, NULL);
+}
+
+static int __init taskstats_init(void)
+{
+	int rc;
+
+	rc = genl_register_family(&family);
+	if (rc)
+		return rc;
+
+	rc = genl_register_ops(&family, &taskstats_ops);
+	if (rc < 0)
+		goto err;
+
+	family_registered = 1;
+	return 0;
+err:
+	genl_unregister_family(&family);
+	return rc;
+}
+
+/*
+ * late initcall ensures initialization of statistics collection
+ * mechanisms precedes initialization of the taskstats interface
+ */
+late_initcall(taskstats_init);
-- 
cgit v1.2.3


From 6f44993fe1d7b2b097f6ac60cd5835c6f5ca0874 Mon Sep 17 00:00:00 2001
From: Shailabh Nagar <nagar@watson.ibm.com>
Date: Fri, 14 Jul 2006 00:24:41 -0700
Subject: [PATCH] per-task-delay-accounting: delay accounting usage of
 taskstats interface

Usage of taskstats interface by delay accounting.

Signed-off-by: Shailabh Nagar <nagar@us.ibm.com>
Signed-off-by: Balbir Singh <balbir@in.ibm.com>
Cc: Jes Sorensen <jes@sgi.com>
Cc: Peter Chubb <peterc@gelato.unsw.edu.au>
Cc: Erich Focht <efocht@ess.nec.de>
Cc: Levent Serinol <lserinol@gmail.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/taskstats.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

(limited to 'kernel/taskstats.c')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 82ec9137d908..ea9506de3b85 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -18,13 +18,13 @@
 
 #include <linux/kernel.h>
 #include <linux/taskstats_kern.h>
+#include <linux/delayacct.h>
 #include <net/genetlink.h>
 #include <asm/atomic.h>
 
 static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
 static int family_registered;
 kmem_cache_t *taskstats_cache;
-static DEFINE_MUTEX(taskstats_exit_mutex);
 
 static struct genl_family family = {
 	.id		= GENL_ID_GENERATE,
@@ -120,7 +120,10 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk,
 	 *		goto err;
 	 */
 
-err:
+	rc = delayacct_add_tsk(stats, tsk);
+	stats->version = TASKSTATS_VERSION;
+
+	/* Define err: label here if needed */
 	put_task_struct(tsk);
 	return rc;
 
@@ -152,8 +155,14 @@ static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk,
 		 *		break;
 		 */
 
+		rc = delayacct_add_tsk(stats, tsk);
+		if (rc)
+			break;
+
 	} while_each_thread(first, tsk);
 	read_unlock(&tasklist_lock);
+	stats->version = TASKSTATS_VERSION;
+
 
 	/*
 	 * Accounting subsytems can also add calls here if they don't
@@ -233,8 +242,6 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
 	if (!family_registered || !tidstats)
 		return;
 
-	mutex_lock(&taskstats_exit_mutex);
-
 	is_thread_group = !thread_group_empty(tsk);
 	rc = 0;
 
@@ -292,7 +299,6 @@ nla_put_failure:
 err_skb:
 	nlmsg_free(rep_skb);
 ret:
-	mutex_unlock(&taskstats_exit_mutex);
 	return;
 }
 
-- 
cgit v1.2.3


From ad4ecbcba72855a2b5319b96e2a3a65ed1ca3bfd Mon Sep 17 00:00:00 2001
From: Shailabh Nagar <nagar@watson.ibm.com>
Date: Fri, 14 Jul 2006 00:24:44 -0700
Subject: [PATCH] delay accounting taskstats interface send tgid once

Send per-tgid data only once during exit of a thread group instead of once
with each member thread exit.

Currently, when a thread exits, besides its per-tid data, the per-tgid data
of its thread group is also sent out, if its thread group is non-empty.
The per-tgid data sent consists of the sum of per-tid stats for all
*remaining* threads of the thread group.

This patch modifies this sending in two ways:

- the per-tgid data is sent only when the last thread of a thread group
  exits.  This cuts down heavily on the overhead of sending/receiving
  per-tgid data, especially when other exploiters of the taskstats
  interface aren't interested in per-tgid stats

- the semantics of the per-tgid data sent are changed.  Instead of being
  the sum of per-tid data for remaining threads, the value now sent is the
  true total accumulated statistics for all threads that are/were part of
  the thread group.

The patch also addresses a minor issue where failure of one accounting
subsystem to fill in the taskstats structure was causing the taskstats
data to not be sent at all.

The patch has been tested for stability and has run cerberus for over
4 hours on an SMP system.

[akpm@osdl.org: bugfixes]
Signed-off-by: Shailabh Nagar <nagar@watson.ibm.com>
Signed-off-by: Balbir Singh <balbir@in.ibm.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/taskstats.c | 98 ++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 66 insertions(+), 32 deletions(-)

(limited to 'kernel/taskstats.c')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index ea9506de3b85..4a0a5022b299 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -132,46 +132,79 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk,
 static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk,
 		struct taskstats *stats)
 {
-	int rc;
 	struct task_struct *tsk, *first;
+	unsigned long flags;
 
+	/*
+	 * Add additional stats from live tasks except zombie thread group
+	 * leaders who are already counted with the dead tasks
+	 */
 	first = tgidtsk;
-	read_lock(&tasklist_lock);
 	if (!first) {
+		read_lock(&tasklist_lock);
 		first = find_task_by_pid(tgid);
 		if (!first) {
 			read_unlock(&tasklist_lock);
 			return -ESRCH;
 		}
-	}
+		get_task_struct(first);
+		read_unlock(&tasklist_lock);
+	} else
+		get_task_struct(first);
+
+	/* Start with stats from dead tasks */
+	spin_lock_irqsave(&first->signal->stats_lock, flags);
+	if (first->signal->stats)
+		memcpy(stats, first->signal->stats, sizeof(*stats));
+	spin_unlock_irqrestore(&first->signal->stats_lock, flags);
+
 	tsk = first;
+	read_lock(&tasklist_lock);
 	do {
+		if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk))
+			continue;
 		/*
-		 * Each accounting subsystem adds calls its functions to
+		 * Accounting subsystem can call its functions here to
 		 * fill in relevant parts of struct taskstsats as follows
 		 *
-		 *	rc = per-task-foo(stats, tsk);
-		 *	if (rc)
-		 *		break;
+		 *	per-task-foo(stats, tsk);
 		 */
-
-		rc = delayacct_add_tsk(stats, tsk);
-		if (rc)
-			break;
+		delayacct_add_tsk(stats, tsk);
 
 	} while_each_thread(first, tsk);
 	read_unlock(&tasklist_lock);
 	stats->version = TASKSTATS_VERSION;
 
-
 	/*
-	 * Accounting subsytems can also add calls here if they don't
-	 * wish to aggregate statistics for per-tgid stats
+	 * Accounting subsytems can also add calls here to modify
+	 * fields of taskstats.
 	 */
 
-	return rc;
+	return 0;
+}
+
+
+static void fill_tgid_exit(struct task_struct *tsk)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&tsk->signal->stats_lock, flags);
+	if (!tsk->signal->stats)
+		goto ret;
+
+	/*
+	 * Each accounting subsystem calls its functions here to
+	 * accumalate its per-task stats for tsk, into the per-tgid structure
+	 *
+	 *	per-task-foo(tsk->signal->stats, tsk);
+	 */
+	delayacct_add_tsk(tsk->signal->stats, tsk);
+ret:
+	spin_unlock_irqrestore(&tsk->signal->stats_lock, flags);
+	return;
 }
 
+
 static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info)
 {
 	int rc = 0;
@@ -230,7 +263,7 @@ err:
 
 /* Send pid data out on exit */
 void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
-			struct taskstats *tgidstats)
+			int group_dead)
 {
 	int rc;
 	struct sk_buff *rep_skb;
@@ -238,13 +271,16 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
 	size_t size;
 	int is_thread_group;
 	struct nlattr *na;
+	unsigned long flags;
 
 	if (!family_registered || !tidstats)
 		return;
 
-	is_thread_group = !thread_group_empty(tsk);
-	rc = 0;
+	spin_lock_irqsave(&tsk->signal->stats_lock, flags);
+	is_thread_group = tsk->signal->stats ? 1 : 0;
+	spin_unlock_irqrestore(&tsk->signal->stats_lock, flags);
 
+	rc = 0;
 	/*
 	 * Size includes space for nested attributes
 	 */
@@ -268,30 +304,28 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
 			*tidstats);
 	nla_nest_end(rep_skb, na);
 
-	if (!is_thread_group || !tgidstats) {
-		send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST);
-		goto ret;
-	}
+	if (!is_thread_group)
+		goto send;
 
-	rc = fill_tgid(tsk->pid, tsk, tgidstats);
 	/*
-	 * If fill_tgid() failed then one probable reason could be that the
-	 * thread group leader has exited. fill_tgid() will fail, send out
-	 * the pid statistics collected earlier.
+	 * tsk has/had a thread group so fill the tsk->signal->stats structure
+	 * Doesn't matter if tsk is the leader or the last group member leaving
 	 */
-	if (rc < 0) {
-		send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST);
-		goto ret;
-	}
+
+	fill_tgid_exit(tsk);
+	if (!group_dead)
+		goto send;
 
 	na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
 	NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid);
+	/* No locking needed for tsk->signal->stats since group is dead */
 	NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
-			*tgidstats);
+			*tsk->signal->stats);
 	nla_nest_end(rep_skb, na);
 
+send:
 	send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST);
-	goto ret;
+	return;
 
 nla_put_failure:
 	genlmsg_cancel(rep_skb, reply);
-- 
cgit v1.2.3


From f9fd8914c1acca0d98b69d831b128d5b52f03c51 Mon Sep 17 00:00:00 2001
From: Shailabh Nagar <nagar@watson.ibm.com>
Date: Fri, 14 Jul 2006 00:24:47 -0700
Subject: [PATCH] per-task delay accounting taskstats interface: control exit
 data through cpumasks

On systems with a large number of cpus, with even a modest rate of tasks
exiting per cpu, the volume of taskstats data sent on thread exit can
overflow a userspace listener's buffers.

One approach to avoiding overflow is to allow listeners to get data for a
limited and specific set of cpus.  By scaling the number of listeners
and/or the cpus they monitor, userspace can handle the statistical data
overload more gracefully.

In this patch, each listener registers to listen to a specific set of cpus
by specifying a cpumask.  The interest is recorded per-cpu.  When a task
exits on a cpu, its taskstats data is unicast to each listener interested
in that cpu.

Thanks to Andrew Morton for pointing out the various scalability and
general concerns of previous attempts and for suggesting this design.

[akpm@osdl.org: build fix]
Signed-off-by: Shailabh Nagar <nagar@watson.ibm.com>
Signed-off-by: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Chandra Seetharaman <sekharan@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/taskstats.c | 200 ++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 189 insertions(+), 11 deletions(-)

(limited to 'kernel/taskstats.c')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 4a0a5022b299..abb59e323544 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -19,9 +19,17 @@
 #include <linux/kernel.h>
 #include <linux/taskstats_kern.h>
 #include <linux/delayacct.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
 #include <net/genetlink.h>
 #include <asm/atomic.h>
 
+/*
+ * Maximum length of a cpumask that can be specified in
+ * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute
+ */
+#define TASKSTATS_CPUMASK_MAXLEN	(100+6*NR_CPUS)
+
 static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
 static int family_registered;
 kmem_cache_t *taskstats_cache;
@@ -37,8 +45,25 @@ static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1]
 __read_mostly = {
 	[TASKSTATS_CMD_ATTR_PID]  = { .type = NLA_U32 },
 	[TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
+	[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
+	[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
+
+struct listener {
+	struct list_head list;
+	pid_t pid;
 };
 
+struct listener_list {
+	struct rw_semaphore sem;
+	struct list_head list;
+};
+static DEFINE_PER_CPU(struct listener_list, listener_array);
+
+enum actions {
+	REGISTER,
+	DEREGISTER,
+	CPU_DONT_CARE
+};
 
 static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
 			void **replyp, size_t size)
@@ -74,25 +99,68 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
 	return 0;
 }
 
-static int send_reply(struct sk_buff *skb, pid_t pid, int event)
+/*
+ * Send taskstats data in @skb to listener with nl_pid @pid
+ */
+static int send_reply(struct sk_buff *skb, pid_t pid)
 {
 	struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
-	void *reply;
+	void *reply = genlmsg_data(genlhdr);
 	int rc;
 
-	reply = genlmsg_data(genlhdr);
-
 	rc = genlmsg_end(skb, reply);
 	if (rc < 0) {
 		nlmsg_free(skb);
 		return rc;
 	}
 
-	if (event == TASKSTATS_MSG_MULTICAST)
-		return genlmsg_multicast(skb, pid, TASKSTATS_LISTEN_GROUP);
 	return genlmsg_unicast(skb, pid);
 }
 
+/*
+ * Send taskstats data in @skb to listeners registered for @cpu's exit data
+ */
+static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
+{
+	struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
+	struct listener_list *listeners;
+	struct listener *s, *tmp;
+	struct sk_buff *skb_next, *skb_cur = skb;
+	void *reply = genlmsg_data(genlhdr);
+	int rc, ret;
+
+	rc = genlmsg_end(skb, reply);
+	if (rc < 0) {
+		nlmsg_free(skb);
+		return rc;
+	}
+
+	rc = 0;
+	listeners = &per_cpu(listener_array, cpu);
+	down_write(&listeners->sem);
+	list_for_each_entry_safe(s, tmp, &listeners->list, list) {
+		skb_next = NULL;
+		if (!list_is_last(&s->list, &listeners->list)) {
+			skb_next = skb_clone(skb_cur, GFP_KERNEL);
+			if (!skb_next) {
+				nlmsg_free(skb_cur);
+				rc = -ENOMEM;
+				break;
+			}
+		}
+		ret = genlmsg_unicast(skb_cur, s->pid);
+		if (ret == -ECONNREFUSED) {
+			list_del(&s->list);
+			kfree(s);
+			rc = ret;
+		}
+		skb_cur = skb_next;
+	}
+	up_write(&listeners->sem);
+
+	return rc;
+}
+
 static int fill_pid(pid_t pid, struct task_struct *pidtsk,
 		struct taskstats *stats)
 {
@@ -204,8 +272,73 @@ ret:
 	return;
 }
 
+static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
+{
+	struct listener_list *listeners;
+	struct listener *s, *tmp;
+	unsigned int cpu;
+	cpumask_t mask = *maskp;
 
-static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info)
+	if (!cpus_subset(mask, cpu_possible_map))
+		return -EINVAL;
+
+	if (isadd == REGISTER) {
+		for_each_cpu_mask(cpu, mask) {
+			s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
+					 cpu_to_node(cpu));
+			if (!s)
+				goto cleanup;
+			s->pid = pid;
+			INIT_LIST_HEAD(&s->list);
+
+			listeners = &per_cpu(listener_array, cpu);
+			down_write(&listeners->sem);
+			list_add(&s->list, &listeners->list);
+			up_write(&listeners->sem);
+		}
+		return 0;
+	}
+
+	/* Deregister or cleanup */
+cleanup:
+	for_each_cpu_mask(cpu, mask) {
+		listeners = &per_cpu(listener_array, cpu);
+		down_write(&listeners->sem);
+		list_for_each_entry_safe(s, tmp, &listeners->list, list) {
+			if (s->pid == pid) {
+				list_del(&s->list);
+				kfree(s);
+				break;
+			}
+		}
+		up_write(&listeners->sem);
+	}
+	return 0;
+}
+
+static int parse(struct nlattr *na, cpumask_t *mask)
+{
+	char *data;
+	int len;
+	int ret;
+
+	if (na == NULL)
+		return 1;
+	len = nla_len(na);
+	if (len > TASKSTATS_CPUMASK_MAXLEN)
+		return -E2BIG;
+	if (len < 1)
+		return -EINVAL;
+	data = kmalloc(len, GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+	nla_strlcpy(data, na, len);
+	ret = cpulist_parse(data, *mask);
+	kfree(data);
+	return ret;
+}
+
+static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
 {
 	int rc = 0;
 	struct sk_buff *rep_skb;
@@ -213,6 +346,19 @@ static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info)
 	void *reply;
 	size_t size;
 	struct nlattr *na;
+	cpumask_t mask;
+
+	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask);
+	if (rc < 0)
+		return rc;
+	if (rc == 0)
+		return add_del_listener(info->snd_pid, &mask, REGISTER);
+
+	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask);
+	if (rc < 0)
+		return rc;
+	if (rc == 0)
+		return add_del_listener(info->snd_pid, &mask, DEREGISTER);
 
 	/*
 	 * Size includes space for nested attributes
@@ -252,7 +398,7 @@ static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info)
 
 	nla_nest_end(rep_skb, na);
 
-	return send_reply(rep_skb, info->snd_pid, TASKSTATS_MSG_UNICAST);
+	return send_reply(rep_skb, info->snd_pid);
 
 nla_put_failure:
 	return genlmsg_cancel(rep_skb, reply);
@@ -261,9 +407,35 @@ err:
 	return rc;
 }
 
+void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu)
+{
+	struct listener_list *listeners;
+	struct taskstats *tmp;
+	/*
+	 * This is the cpu on which the task is exiting currently and will
+	 * be the one for which the exit event is sent, even if the cpu
+	 * on which this function is running changes later.
+	 */
+	*mycpu = raw_smp_processor_id();
+
+	*ptidstats = NULL;
+	tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
+	if (!tmp)
+		return;
+
+	listeners = &per_cpu(listener_array, *mycpu);
+	down_read(&listeners->sem);
+	if (!list_empty(&listeners->list)) {
+		*ptidstats = tmp;
+		tmp = NULL;
+	}
+	up_read(&listeners->sem);
+	kfree(tmp);
+}
+
 /* Send pid data out on exit */
 void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
-			int group_dead)
+			int group_dead, unsigned int mycpu)
 {
 	int rc;
 	struct sk_buff *rep_skb;
@@ -324,7 +496,7 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
 	nla_nest_end(rep_skb, na);
 
 send:
-	send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST);
+	send_cpu_listeners(rep_skb, mycpu);
 	return;
 
 nla_put_failure:
@@ -338,16 +510,22 @@ ret:
 
 static struct genl_ops taskstats_ops = {
 	.cmd		= TASKSTATS_CMD_GET,
-	.doit		= taskstats_send_stats,
+	.doit		= taskstats_user_cmd,
 	.policy		= taskstats_cmd_get_policy,
 };
 
 /* Needed early in initialization */
 void __init taskstats_init_early(void)
 {
+	unsigned int i;
+
 	taskstats_cache = kmem_cache_create("taskstats_cache",
 						sizeof(struct taskstats),
 						0, SLAB_PANIC, NULL, NULL);
+	for_each_possible_cpu(i) {
+		INIT_LIST_HEAD(&(per_cpu(listener_array, i).list));
+		init_rwsem(&(per_cpu(listener_array, i).sem));
+	}
 }
 
 static int __init taskstats_init(void)
-- 
cgit v1.2.3


From bb129994c3bff9c5e8df91f05d7e9b6402fbd83f Mon Sep 17 00:00:00 2001
From: Shailabh Nagar <nagar@watson.ibm.com>
Date: Fri, 14 Jul 2006 00:24:47 -0700
Subject: [PATCH] Remove down_write() from taskstats code invoked on the exit()
 path

In send_cpu_listeners(), which is called on the exit path, a down_write()
was protecting operations like skb_clone() and genlmsg_unicast() that do
GFP_KERNEL allocations.  If the oom-killer decides to kill tasks to satisfy
the allocations, the exit of those tasks could block on the same semaphore.

The down_write() was only needed to allow removal of invalid listeners from
the listener list.  The patch converts the down_write to a down_read and
defers the removal to a separate critical region.  This ensures that even
if the oom-killer is called, no other task's exit is blocked as it can
still acquire another down_read.

Thanks to Andrew Morton & Herbert Xu for pointing out the oom related
pitfalls, and to Chandra Seetharaman for suggesting this fix instead of
using something more complex like RCU.

Signed-off-by: Chandra Seetharaman <sekharan@us.ibm.com>
Signed-off-by: Shailabh Nagar <nagar@watson.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/taskstats.c | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

(limited to 'kernel/taskstats.c')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index abb59e323544..f45179ce028e 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -51,6 +51,7 @@ __read_mostly = {
 struct listener {
 	struct list_head list;
 	pid_t pid;
+	char valid;
 };
 
 struct listener_list {
@@ -127,7 +128,7 @@ static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
 	struct listener *s, *tmp;
 	struct sk_buff *skb_next, *skb_cur = skb;
 	void *reply = genlmsg_data(genlhdr);
-	int rc, ret;
+	int rc, ret, delcount = 0;
 
 	rc = genlmsg_end(skb, reply);
 	if (rc < 0) {
@@ -137,7 +138,7 @@ static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
 
 	rc = 0;
 	listeners = &per_cpu(listener_array, cpu);
-	down_write(&listeners->sem);
+	down_read(&listeners->sem);
 	list_for_each_entry_safe(s, tmp, &listeners->list, list) {
 		skb_next = NULL;
 		if (!list_is_last(&s->list, &listeners->list)) {
@@ -150,14 +151,26 @@ static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
 		}
 		ret = genlmsg_unicast(skb_cur, s->pid);
 		if (ret == -ECONNREFUSED) {
-			list_del(&s->list);
-			kfree(s);
+			s->valid = 0;
+			delcount++;
 			rc = ret;
 		}
 		skb_cur = skb_next;
 	}
-	up_write(&listeners->sem);
+	up_read(&listeners->sem);
+
+	if (!delcount)
+		return rc;
 
+	/* Delete invalidated entries */
+	down_write(&listeners->sem);
+	list_for_each_entry_safe(s, tmp, &listeners->list, list) {
+		if (!s->valid) {
+			list_del(&s->list);
+			kfree(s);
+		}
+	}
+	up_write(&listeners->sem);
 	return rc;
 }
 
@@ -290,6 +303,7 @@ static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
 				goto cleanup;
 			s->pid = pid;
 			INIT_LIST_HEAD(&s->list);
+			s->valid = 1;
 
 			listeners = &per_cpu(listener_array, cpu);
 			down_write(&listeners->sem);
-- 
cgit v1.2.3


From 7d94dddd438bcba97db44f120da39bb001b5249f Mon Sep 17 00:00:00 2001
From: Shailabh Nagar <nagar@watson.ibm.com>
Date: Sun, 30 Jul 2006 03:03:10 -0700
Subject: [PATCH] make taskstats sending completely independent of delay
 accounting on/off status

Complete the separation of delay accounting and taskstats by ignoring the
return value of delay accounting functions that fill in parts of taskstats
before it is sent out (either in response to a command or as part of a task
exit).

Also make delayacct_add_tsk return silently when delay accounting is turned
off rather than treat it as an error.

Signed-off-by: Shailabh Nagar <nagar@watson.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/taskstats.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'kernel/taskstats.c')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index f45179ce028e..b4c737a11408 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -177,7 +177,7 @@ static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
 static int fill_pid(pid_t pid, struct task_struct *pidtsk,
 		struct taskstats *stats)
 {
-	int rc;
+	int rc = 0;
 	struct task_struct *tsk = pidtsk;
 
 	if (!pidtsk) {
@@ -196,12 +196,10 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk,
 	 * Each accounting subsystem adds calls to its functions to
 	 * fill in relevant parts of struct taskstsats as follows
 	 *
-	 *	rc = per-task-foo(stats, tsk);
-	 *	if (rc)
-	 *		goto err;
+	 *	per-task-foo(stats, tsk);
 	 */
 
-	rc = delayacct_add_tsk(stats, tsk);
+	delayacct_add_tsk(stats, tsk);
 	stats->version = TASKSTATS_VERSION;
 
 	/* Define err: label here if needed */
-- 
cgit v1.2.3


From d94a041519f3ab1ac023bf917619cd8c4a7d3c01 Mon Sep 17 00:00:00 2001
From: Shailabh Nagar <nagar@watson.ibm.com>
Date: Sun, 30 Jul 2006 03:03:11 -0700
Subject: [PATCH] taskstats: free skb, avoid returns in send_cpu_listeners

Add a missing free of the skb in the case where there are no listeners at
all.  Also stop returning error values from the function, as they are
unused by the sole caller.

Signed-off-by: Shailabh Nagar <nagar@watson.ibm.com>
Signed-off-by: Chandra Seetharaman <sekharan@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/taskstats.c | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

(limited to 'kernel/taskstats.c')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index b4c737a11408..e78187657330 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -121,46 +121,45 @@ static int send_reply(struct sk_buff *skb, pid_t pid)
 /*
  * Send taskstats data in @skb to listeners registered for @cpu's exit data
  */
-static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
+static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
 {
 	struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
 	struct listener_list *listeners;
 	struct listener *s, *tmp;
 	struct sk_buff *skb_next, *skb_cur = skb;
 	void *reply = genlmsg_data(genlhdr);
-	int rc, ret, delcount = 0;
+	int rc, delcount = 0;
 
 	rc = genlmsg_end(skb, reply);
 	if (rc < 0) {
 		nlmsg_free(skb);
-		return rc;
+		return;
 	}
 
 	rc = 0;
 	listeners = &per_cpu(listener_array, cpu);
 	down_read(&listeners->sem);
-	list_for_each_entry_safe(s, tmp, &listeners->list, list) {
+	list_for_each_entry(s, &listeners->list, list) {
 		skb_next = NULL;
 		if (!list_is_last(&s->list, &listeners->list)) {
 			skb_next = skb_clone(skb_cur, GFP_KERNEL);
-			if (!skb_next) {
-				nlmsg_free(skb_cur);
-				rc = -ENOMEM;
+			if (!skb_next)
 				break;
-			}
 		}
-		ret = genlmsg_unicast(skb_cur, s->pid);
-		if (ret == -ECONNREFUSED) {
+		rc = genlmsg_unicast(skb_cur, s->pid);
+		if (rc == -ECONNREFUSED) {
 			s->valid = 0;
 			delcount++;
-			rc = ret;
 		}
 		skb_cur = skb_next;
 	}
 	up_read(&listeners->sem);
 
+	if (skb_cur)
+		nlmsg_free(skb_cur);
+
 	if (!delcount)
-		return rc;
+		return;
 
 	/* Delete invalidated entries */
 	down_write(&listeners->sem);
@@ -171,7 +170,6 @@ static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
 		}
 	}
 	up_write(&listeners->sem);
-	return rc;
 }
 
 static int fill_pid(pid_t pid, struct task_struct *pidtsk,
-- 
cgit v1.2.3