From c757249af152c59fd74b85e52e8c090acb33d9c0 Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Fri, 14 Jul 2006 00:24:40 -0700 Subject: [PATCH] per-task-delay-accounting: taskstats interface Create a "taskstats" interface based on generic netlink (NETLINK_GENERIC family), for getting statistics of tasks and thread groups during their lifetime and when they exit. The interface is intended for use by multiple accounting packages though it is being created in the context of delay accounting. This patch creates the interface without populating the fields of the data that is sent to the user in response to a command or upon the exit of a task. Each accounting package interested in using taskstats has to provide an additional patch to add its stats to the common structure. [akpm@osdl.org: cleanups, Kconfig fix] Signed-off-by: Shailabh Nagar Signed-off-by: Balbir Singh Cc: Jes Sorensen Cc: Peter Chubb Cc: Erich Focht Cc: Levent Serinol Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 336 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 336 insertions(+) create mode 100644 kernel/taskstats.c (limited to 'kernel/taskstats.c') diff --git a/kernel/taskstats.c b/kernel/taskstats.c new file mode 100644 index 000000000000..82ec9137d908 --- /dev/null +++ b/kernel/taskstats.c @@ -0,0 +1,336 @@ +/* + * taskstats.c - Export per-task statistics to userland + * + * Copyright (C) Shailabh Nagar, IBM Corp. 2006 + * (C) Balbir Singh, IBM Corp. 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + */ + +#include +#include +#include +#include + +static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; +static int family_registered; +kmem_cache_t *taskstats_cache; +static DEFINE_MUTEX(taskstats_exit_mutex); + +static struct genl_family family = { + .id = GENL_ID_GENERATE, + .name = TASKSTATS_GENL_NAME, + .version = TASKSTATS_GENL_VERSION, + .maxattr = TASKSTATS_CMD_ATTR_MAX, +}; + +static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] +__read_mostly = { + [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, + [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, +}; + + +static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, + void **replyp, size_t size) +{ + struct sk_buff *skb; + void *reply; + + /* + * If new attributes are added, please revisit this allocation + */ + skb = nlmsg_new(size); + if (!skb) + return -ENOMEM; + + if (!info) { + int seq = get_cpu_var(taskstats_seqnum)++; + put_cpu_var(taskstats_seqnum); + + reply = genlmsg_put(skb, 0, seq, + family.id, 0, 0, + cmd, family.version); + } else + reply = genlmsg_put(skb, info->snd_pid, info->snd_seq, + family.id, 0, 0, + cmd, family.version); + if (reply == NULL) { + nlmsg_free(skb); + return -EINVAL; + } + + *skbp = skb; + *replyp = reply; + return 0; +} + +static int send_reply(struct sk_buff *skb, pid_t pid, int event) +{ + struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); + void *reply; + int rc; + + reply = genlmsg_data(genlhdr); + + rc = genlmsg_end(skb, reply); + if (rc < 0) { + nlmsg_free(skb); + return rc; + } + + if (event == TASKSTATS_MSG_MULTICAST) + return genlmsg_multicast(skb, pid, TASKSTATS_LISTEN_GROUP); + return genlmsg_unicast(skb, pid); +} + +static int fill_pid(pid_t pid, struct task_struct *pidtsk, + struct taskstats *stats) +{ + int rc; + struct task_struct *tsk = pidtsk; + + if (!pidtsk) { + read_lock(&tasklist_lock); + tsk = find_task_by_pid(pid); + if (!tsk) 
{ + read_unlock(&tasklist_lock); + return -ESRCH; + } + get_task_struct(tsk); + read_unlock(&tasklist_lock); + } else + get_task_struct(tsk); + + /* + * Each accounting subsystem adds calls to its functions to + * fill in relevant parts of struct taskstsats as follows + * + * rc = per-task-foo(stats, tsk); + * if (rc) + * goto err; + */ + +err: + put_task_struct(tsk); + return rc; + +} + +static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, + struct taskstats *stats) +{ + int rc; + struct task_struct *tsk, *first; + + first = tgidtsk; + read_lock(&tasklist_lock); + if (!first) { + first = find_task_by_pid(tgid); + if (!first) { + read_unlock(&tasklist_lock); + return -ESRCH; + } + } + tsk = first; + do { + /* + * Each accounting subsystem adds calls its functions to + * fill in relevant parts of struct taskstsats as follows + * + * rc = per-task-foo(stats, tsk); + * if (rc) + * break; + */ + + } while_each_thread(first, tsk); + read_unlock(&tasklist_lock); + + /* + * Accounting subsytems can also add calls here if they don't + * wish to aggregate statistics for per-tgid stats + */ + + return rc; +} + +static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info) +{ + int rc = 0; + struct sk_buff *rep_skb; + struct taskstats stats; + void *reply; + size_t size; + struct nlattr *na; + + /* + * Size includes space for nested attributes + */ + size = nla_total_size(sizeof(u32)) + + nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); + + memset(&stats, 0, sizeof(stats)); + rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); + if (rc < 0) + return rc; + + if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { + u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); + rc = fill_pid(pid, NULL, &stats); + if (rc < 0) + goto err; + + na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); + NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid); + NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, + stats); + } else if 
(info->attrs[TASKSTATS_CMD_ATTR_TGID]) { + u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); + rc = fill_tgid(tgid, NULL, &stats); + if (rc < 0) + goto err; + + na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); + NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid); + NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, + stats); + } else { + rc = -EINVAL; + goto err; + } + + nla_nest_end(rep_skb, na); + + return send_reply(rep_skb, info->snd_pid, TASKSTATS_MSG_UNICAST); + +nla_put_failure: + return genlmsg_cancel(rep_skb, reply); +err: + nlmsg_free(rep_skb); + return rc; +} + +/* Send pid data out on exit */ +void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, + struct taskstats *tgidstats) +{ + int rc; + struct sk_buff *rep_skb; + void *reply; + size_t size; + int is_thread_group; + struct nlattr *na; + + if (!family_registered || !tidstats) + return; + + mutex_lock(&taskstats_exit_mutex); + + is_thread_group = !thread_group_empty(tsk); + rc = 0; + + /* + * Size includes space for nested attributes + */ + size = nla_total_size(sizeof(u32)) + + nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); + + if (is_thread_group) + size = 2 * size; /* PID + STATS + TGID + STATS */ + + rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); + if (rc < 0) + goto ret; + + rc = fill_pid(tsk->pid, tsk, tidstats); + if (rc < 0) + goto err_skb; + + na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); + NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid); + NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, + *tidstats); + nla_nest_end(rep_skb, na); + + if (!is_thread_group || !tgidstats) { + send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST); + goto ret; + } + + rc = fill_tgid(tsk->pid, tsk, tgidstats); + /* + * If fill_tgid() failed then one probable reason could be that the + * thread group leader has exited. fill_tgid() will fail, send out + * the pid statistics collected earlier. 
+ */ + if (rc < 0) { + send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST); + goto ret; + } + + na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); + NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid); + NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, + *tgidstats); + nla_nest_end(rep_skb, na); + + send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST); + goto ret; + +nla_put_failure: + genlmsg_cancel(rep_skb, reply); + goto ret; +err_skb: + nlmsg_free(rep_skb); +ret: + mutex_unlock(&taskstats_exit_mutex); + return; +} + +static struct genl_ops taskstats_ops = { + .cmd = TASKSTATS_CMD_GET, + .doit = taskstats_send_stats, + .policy = taskstats_cmd_get_policy, +}; + +/* Needed early in initialization */ +void __init taskstats_init_early(void) +{ + taskstats_cache = kmem_cache_create("taskstats_cache", + sizeof(struct taskstats), + 0, SLAB_PANIC, NULL, NULL); +} + +static int __init taskstats_init(void) +{ + int rc; + + rc = genl_register_family(&family); + if (rc) + return rc; + + rc = genl_register_ops(&family, &taskstats_ops); + if (rc < 0) + goto err; + + family_registered = 1; + return 0; +err: + genl_unregister_family(&family); + return rc; +} + +/* + * late initcall ensures initialization of statistics collection + * mechanisms precedes initialization of the taskstats interface + */ +late_initcall(taskstats_init); -- cgit v1.2.3 From 6f44993fe1d7b2b097f6ac60cd5835c6f5ca0874 Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Fri, 14 Jul 2006 00:24:41 -0700 Subject: [PATCH] per-task-delay-accounting: delay accounting usage of taskstats interface Usage of taskstats interface by delay accounting. 
Signed-off-by: Shailabh Nagar Signed-off-by: Balbir Singh Cc: Jes Sorensen Cc: Peter Chubb Cc: Erich Focht Cc: Levent Serinol Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel/taskstats.c') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 82ec9137d908..ea9506de3b85 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -18,13 +18,13 @@ #include #include +#include #include #include static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; static int family_registered; kmem_cache_t *taskstats_cache; -static DEFINE_MUTEX(taskstats_exit_mutex); static struct genl_family family = { .id = GENL_ID_GENERATE, @@ -120,7 +120,10 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk, * goto err; */ -err: + rc = delayacct_add_tsk(stats, tsk); + stats->version = TASKSTATS_VERSION; + + /* Define err: label here if needed */ put_task_struct(tsk); return rc; @@ -152,8 +155,14 @@ static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, * break; */ + rc = delayacct_add_tsk(stats, tsk); + if (rc) + break; + } while_each_thread(first, tsk); read_unlock(&tasklist_lock); + stats->version = TASKSTATS_VERSION; + /* * Accounting subsytems can also add calls here if they don't @@ -233,8 +242,6 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, if (!family_registered || !tidstats) return; - mutex_lock(&taskstats_exit_mutex); - is_thread_group = !thread_group_empty(tsk); rc = 0; @@ -292,7 +299,6 @@ nla_put_failure: err_skb: nlmsg_free(rep_skb); ret: - mutex_unlock(&taskstats_exit_mutex); return; } -- cgit v1.2.3 From ad4ecbcba72855a2b5319b96e2a3a65ed1ca3bfd Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Fri, 14 Jul 2006 00:24:44 -0700 Subject: [PATCH] delay accounting taskstats interface send tgid once Send per-tgid data only once during exit of a thread group instead of once with each member thread 
exit. Currently, when a thread exits, besides its per-tid data, the per-tgid data of its thread group is also sent out, if its thread group is non-empty. The per-tgid data sent consists of the sum of per-tid stats for all *remaining* threads of the thread group. This patch modifies this sending in two ways: - the per-tgid data is sent only when the last thread of a thread group exits. This cuts down heavily on the overhead of sending/receiving per-tgid data, especially when other exploiters of the taskstats interface aren't interested in per-tgid stats - the semantics of the per-tgid data sent are changed. Instead of being the sum of per-tid data for remaining threads, the value now sent is the true total accumulated statistics for all threads that are/were part of the thread group. The patch also addresses a minor issue where failure of one accounting subsystem to fill in the taskstats structure was causing the taskstats data to not be sent at all. The patch has been tested for stability and has run cerberus for over 4 hours on an SMP system. 
[akpm@osdl.org: bugfixes] Signed-off-by: Shailabh Nagar Signed-off-by: Balbir Singh Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 98 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 66 insertions(+), 32 deletions(-) (limited to 'kernel/taskstats.c') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index ea9506de3b85..4a0a5022b299 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -132,46 +132,79 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk, static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, struct taskstats *stats) { - int rc; struct task_struct *tsk, *first; + unsigned long flags; + /* + * Add additional stats from live tasks except zombie thread group + * leaders who are already counted with the dead tasks + */ first = tgidtsk; - read_lock(&tasklist_lock); if (!first) { + read_lock(&tasklist_lock); first = find_task_by_pid(tgid); if (!first) { read_unlock(&tasklist_lock); return -ESRCH; } - } + get_task_struct(first); + read_unlock(&tasklist_lock); + } else + get_task_struct(first); + + /* Start with stats from dead tasks */ + spin_lock_irqsave(&first->signal->stats_lock, flags); + if (first->signal->stats) + memcpy(stats, first->signal->stats, sizeof(*stats)); + spin_unlock_irqrestore(&first->signal->stats_lock, flags); + tsk = first; + read_lock(&tasklist_lock); do { + if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk)) + continue; /* - * Each accounting subsystem adds calls its functions to + * Accounting subsystem can call its functions here to * fill in relevant parts of struct taskstsats as follows * - * rc = per-task-foo(stats, tsk); - * if (rc) - * break; + * per-task-foo(stats, tsk); */ - - rc = delayacct_add_tsk(stats, tsk); - if (rc) - break; + delayacct_add_tsk(stats, tsk); } while_each_thread(first, tsk); read_unlock(&tasklist_lock); stats->version = TASKSTATS_VERSION; - /* - * Accounting subsytems can also add calls here if 
they don't - * wish to aggregate statistics for per-tgid stats + * Accounting subsytems can also add calls here to modify + * fields of taskstats. */ - return rc; + return 0; +} + + +static void fill_tgid_exit(struct task_struct *tsk) +{ + unsigned long flags; + + spin_lock_irqsave(&tsk->signal->stats_lock, flags); + if (!tsk->signal->stats) + goto ret; + + /* + * Each accounting subsystem calls its functions here to + * accumalate its per-task stats for tsk, into the per-tgid structure + * + * per-task-foo(tsk->signal->stats, tsk); + */ + delayacct_add_tsk(tsk->signal->stats, tsk); +ret: + spin_unlock_irqrestore(&tsk->signal->stats_lock, flags); + return; } + static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info) { int rc = 0; @@ -230,7 +263,7 @@ err: /* Send pid data out on exit */ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, - struct taskstats *tgidstats) + int group_dead) { int rc; struct sk_buff *rep_skb; @@ -238,13 +271,16 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, size_t size; int is_thread_group; struct nlattr *na; + unsigned long flags; if (!family_registered || !tidstats) return; - is_thread_group = !thread_group_empty(tsk); - rc = 0; + spin_lock_irqsave(&tsk->signal->stats_lock, flags); + is_thread_group = tsk->signal->stats ? 1 : 0; + spin_unlock_irqrestore(&tsk->signal->stats_lock, flags); + rc = 0; /* * Size includes space for nested attributes */ @@ -268,30 +304,28 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, *tidstats); nla_nest_end(rep_skb, na); - if (!is_thread_group || !tgidstats) { - send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST); - goto ret; - } + if (!is_thread_group) + goto send; - rc = fill_tgid(tsk->pid, tsk, tgidstats); /* - * If fill_tgid() failed then one probable reason could be that the - * thread group leader has exited. fill_tgid() will fail, send out - * the pid statistics collected earlier. 
+ * tsk has/had a thread group so fill the tsk->signal->stats structure + * Doesn't matter if tsk is the leader or the last group member leaving */ - if (rc < 0) { - send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST); - goto ret; - } + + fill_tgid_exit(tsk); + if (!group_dead) + goto send; na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid); + /* No locking needed for tsk->signal->stats since group is dead */ NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, - *tgidstats); + *tsk->signal->stats); nla_nest_end(rep_skb, na); +send: send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST); - goto ret; + return; nla_put_failure: genlmsg_cancel(rep_skb, reply); -- cgit v1.2.3 From f9fd8914c1acca0d98b69d831b128d5b52f03c51 Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Fri, 14 Jul 2006 00:24:47 -0700 Subject: [PATCH] per-task delay accounting taskstats interface: control exit data through cpumasks On systems with a large number of cpus, with even a modest rate of tasks exiting per cpu, the volume of taskstats data sent on thread exit can overflow a userspace listener's buffers. One approach to avoiding overflow is to allow listeners to get data for a limited and specific set of cpus. By scaling the number of listeners and/or the cpus they monitor, userspace can handle the statistical data overload more gracefully. In this patch, each listener registers to listen to a specific set of cpus by specifying a cpumask. The interest is recorded per-cpu. When a task exits on a cpu, its taskstats data is unicast to each listener interested in that cpu. Thanks to Andrew Morton for pointing out the various scalability and general concerns of previous attempts and for suggesting this design. 
[akpm@osdl.org: build fix] Signed-off-by: Shailabh Nagar Signed-off-by: Balbir Singh Signed-off-by: Chandra Seetharaman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 200 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 189 insertions(+), 11 deletions(-) (limited to 'kernel/taskstats.c') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 4a0a5022b299..abb59e323544 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -19,9 +19,17 @@ #include #include #include +#include +#include #include #include +/* + * Maximum length of a cpumask that can be specified in + * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute + */ +#define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS) + static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; static int family_registered; kmem_cache_t *taskstats_cache; @@ -37,8 +45,25 @@ static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] __read_mostly = { [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, + [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, + [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; + +struct listener { + struct list_head list; + pid_t pid; }; +struct listener_list { + struct rw_semaphore sem; + struct list_head list; +}; +static DEFINE_PER_CPU(struct listener_list, listener_array); + +enum actions { + REGISTER, + DEREGISTER, + CPU_DONT_CARE +}; static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, void **replyp, size_t size) @@ -74,25 +99,68 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, return 0; } -static int send_reply(struct sk_buff *skb, pid_t pid, int event) +/* + * Send taskstats data in @skb to listener with nl_pid @pid + */ +static int send_reply(struct sk_buff *skb, pid_t pid) { struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); - void *reply; + void 
*reply = genlmsg_data(genlhdr); int rc; - reply = genlmsg_data(genlhdr); - rc = genlmsg_end(skb, reply); if (rc < 0) { nlmsg_free(skb); return rc; } - if (event == TASKSTATS_MSG_MULTICAST) - return genlmsg_multicast(skb, pid, TASKSTATS_LISTEN_GROUP); return genlmsg_unicast(skb, pid); } +/* + * Send taskstats data in @skb to listeners registered for @cpu's exit data + */ +static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) +{ + struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); + struct listener_list *listeners; + struct listener *s, *tmp; + struct sk_buff *skb_next, *skb_cur = skb; + void *reply = genlmsg_data(genlhdr); + int rc, ret; + + rc = genlmsg_end(skb, reply); + if (rc < 0) { + nlmsg_free(skb); + return rc; + } + + rc = 0; + listeners = &per_cpu(listener_array, cpu); + down_write(&listeners->sem); + list_for_each_entry_safe(s, tmp, &listeners->list, list) { + skb_next = NULL; + if (!list_is_last(&s->list, &listeners->list)) { + skb_next = skb_clone(skb_cur, GFP_KERNEL); + if (!skb_next) { + nlmsg_free(skb_cur); + rc = -ENOMEM; + break; + } + } + ret = genlmsg_unicast(skb_cur, s->pid); + if (ret == -ECONNREFUSED) { + list_del(&s->list); + kfree(s); + rc = ret; + } + skb_cur = skb_next; + } + up_write(&listeners->sem); + + return rc; +} + static int fill_pid(pid_t pid, struct task_struct *pidtsk, struct taskstats *stats) { @@ -204,8 +272,73 @@ ret: return; } +static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd) +{ + struct listener_list *listeners; + struct listener *s, *tmp; + unsigned int cpu; + cpumask_t mask = *maskp; -static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info) + if (!cpus_subset(mask, cpu_possible_map)) + return -EINVAL; + + if (isadd == REGISTER) { + for_each_cpu_mask(cpu, mask) { + s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, + cpu_to_node(cpu)); + if (!s) + goto cleanup; + s->pid = pid; + INIT_LIST_HEAD(&s->list); + + listeners = &per_cpu(listener_array, 
cpu); + down_write(&listeners->sem); + list_add(&s->list, &listeners->list); + up_write(&listeners->sem); + } + return 0; + } + + /* Deregister or cleanup */ +cleanup: + for_each_cpu_mask(cpu, mask) { + listeners = &per_cpu(listener_array, cpu); + down_write(&listeners->sem); + list_for_each_entry_safe(s, tmp, &listeners->list, list) { + if (s->pid == pid) { + list_del(&s->list); + kfree(s); + break; + } + } + up_write(&listeners->sem); + } + return 0; +} + +static int parse(struct nlattr *na, cpumask_t *mask) +{ + char *data; + int len; + int ret; + + if (na == NULL) + return 1; + len = nla_len(na); + if (len > TASKSTATS_CPUMASK_MAXLEN) + return -E2BIG; + if (len < 1) + return -EINVAL; + data = kmalloc(len, GFP_KERNEL); + if (!data) + return -ENOMEM; + nla_strlcpy(data, na, len); + ret = cpulist_parse(data, *mask); + kfree(data); + return ret; +} + +static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) { int rc = 0; struct sk_buff *rep_skb; @@ -213,6 +346,19 @@ static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info) void *reply; size_t size; struct nlattr *na; + cpumask_t mask; + + rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask); + if (rc < 0) + return rc; + if (rc == 0) + return add_del_listener(info->snd_pid, &mask, REGISTER); + + rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask); + if (rc < 0) + return rc; + if (rc == 0) + return add_del_listener(info->snd_pid, &mask, DEREGISTER); /* * Size includes space for nested attributes @@ -252,7 +398,7 @@ static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info) nla_nest_end(rep_skb, na); - return send_reply(rep_skb, info->snd_pid, TASKSTATS_MSG_UNICAST); + return send_reply(rep_skb, info->snd_pid); nla_put_failure: return genlmsg_cancel(rep_skb, reply); @@ -261,9 +407,35 @@ err: return rc; } +void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu) +{ + struct listener_list *listeners; + struct 
taskstats *tmp; + /* + * This is the cpu on which the task is exiting currently and will + * be the one for which the exit event is sent, even if the cpu + * on which this function is running changes later. + */ + *mycpu = raw_smp_processor_id(); + + *ptidstats = NULL; + tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); + if (!tmp) + return; + + listeners = &per_cpu(listener_array, *mycpu); + down_read(&listeners->sem); + if (!list_empty(&listeners->list)) { + *ptidstats = tmp; + tmp = NULL; + } + up_read(&listeners->sem); + kfree(tmp); +} + /* Send pid data out on exit */ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, - int group_dead) + int group_dead, unsigned int mycpu) { int rc; struct sk_buff *rep_skb; @@ -324,7 +496,7 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, nla_nest_end(rep_skb, na); send: - send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST); + send_cpu_listeners(rep_skb, mycpu); return; nla_put_failure: @@ -338,16 +510,22 @@ ret: static struct genl_ops taskstats_ops = { .cmd = TASKSTATS_CMD_GET, - .doit = taskstats_send_stats, + .doit = taskstats_user_cmd, .policy = taskstats_cmd_get_policy, }; /* Needed early in initialization */ void __init taskstats_init_early(void) { + unsigned int i; + taskstats_cache = kmem_cache_create("taskstats_cache", sizeof(struct taskstats), 0, SLAB_PANIC, NULL, NULL); + for_each_possible_cpu(i) { + INIT_LIST_HEAD(&(per_cpu(listener_array, i).list)); + init_rwsem(&(per_cpu(listener_array, i).sem)); + } } static int __init taskstats_init(void) -- cgit v1.2.3 From bb129994c3bff9c5e8df91f05d7e9b6402fbd83f Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Fri, 14 Jul 2006 00:24:47 -0700 Subject: [PATCH] Remove down_write() from taskstats code invoked on the exit() path In send_cpu_listeners(), which is called on the exit path, a down_write() was protecting operations like skb_clone() and genlmsg_unicast() that do GFP_KERNEL allocations. 
If the oom-killer decides to kill tasks to satisfy the allocations, the exit of those tasks could block on the same semaphore. The down_write() was only needed to allow removal of invalid listeners from the listener list. The patch converts the down_write to a down_read and defers the removal to a separate critical region. This ensures that even if the oom-killer is called, no other task's exit is blocked as it can still acquire another down_read. Thanks to Andrew Morton & Herbert Xu for pointing out the oom related pitfalls, and to Chandra Seetharaman for suggesting this fix instead of using something more complex like RCU. Signed-off-by: Chandra Seetharaman Signed-off-by: Shailabh Nagar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel/taskstats.c') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index abb59e323544..f45179ce028e 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -51,6 +51,7 @@ __read_mostly = { struct listener { struct list_head list; pid_t pid; + char valid; }; struct listener_list { @@ -127,7 +128,7 @@ static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) struct listener *s, *tmp; struct sk_buff *skb_next, *skb_cur = skb; void *reply = genlmsg_data(genlhdr); - int rc, ret; + int rc, ret, delcount = 0; rc = genlmsg_end(skb, reply); if (rc < 0) { @@ -137,7 +138,7 @@ static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) rc = 0; listeners = &per_cpu(listener_array, cpu); - down_write(&listeners->sem); + down_read(&listeners->sem); list_for_each_entry_safe(s, tmp, &listeners->list, list) { skb_next = NULL; if (!list_is_last(&s->list, &listeners->list)) { @@ -150,14 +151,26 @@ static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) } ret = genlmsg_unicast(skb_cur, s->pid); if (ret == -ECONNREFUSED) { - list_del(&s->list); - kfree(s); + s->valid = 0; + delcount++; rc = 
ret; } skb_cur = skb_next; } - up_write(&listeners->sem); + up_read(&listeners->sem); + + if (!delcount) + return rc; + /* Delete invalidated entries */ + down_write(&listeners->sem); + list_for_each_entry_safe(s, tmp, &listeners->list, list) { + if (!s->valid) { + list_del(&s->list); + kfree(s); + } + } + up_write(&listeners->sem); return rc; } @@ -290,6 +303,7 @@ static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd) goto cleanup; s->pid = pid; INIT_LIST_HEAD(&s->list); + s->valid = 1; listeners = &per_cpu(listener_array, cpu); down_write(&listeners->sem); -- cgit v1.2.3 From 7d94dddd438bcba97db44f120da39bb001b5249f Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Sun, 30 Jul 2006 03:03:10 -0700 Subject: [PATCH] make taskstats sending completely independent of delay accounting on/off status Complete the separation of delay accounting and taskstats by ignoring the return value of delay accounting functions that fill in parts of taskstats before it is sent out (either in response to a command or as part of a task exit). Also make delayacct_add_tsk return silently when delay accounting is turned off rather than treat it as an error. 
Signed-off-by: Shailabh Nagar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel/taskstats.c') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index f45179ce028e..b4c737a11408 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -177,7 +177,7 @@ static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) static int fill_pid(pid_t pid, struct task_struct *pidtsk, struct taskstats *stats) { - int rc; + int rc = 0; struct task_struct *tsk = pidtsk; if (!pidtsk) { @@ -196,12 +196,10 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk, * Each accounting subsystem adds calls to its functions to * fill in relevant parts of struct taskstsats as follows * - * rc = per-task-foo(stats, tsk); - * if (rc) - * goto err; + * per-task-foo(stats, tsk); */ - rc = delayacct_add_tsk(stats, tsk); + delayacct_add_tsk(stats, tsk); stats->version = TASKSTATS_VERSION; /* Define err: label here if needed */ -- cgit v1.2.3 From d94a041519f3ab1ac023bf917619cd8c4a7d3c01 Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Sun, 30 Jul 2006 03:03:11 -0700 Subject: [PATCH] taskstats: free skb, avoid returns in send_cpu_listeners Add a missing freeing of skb in the case there are no listeners at all. Also remove the returning of error values by the function as it is unused by the sole caller. 
Signed-off-by: Shailabh Nagar Signed-off-by: Chandra Seetharaman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'kernel/taskstats.c') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index b4c737a11408..e78187657330 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -121,46 +121,45 @@ static int send_reply(struct sk_buff *skb, pid_t pid) /* * Send taskstats data in @skb to listeners registered for @cpu's exit data */ -static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) +static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) { struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); struct listener_list *listeners; struct listener *s, *tmp; struct sk_buff *skb_next, *skb_cur = skb; void *reply = genlmsg_data(genlhdr); - int rc, ret, delcount = 0; + int rc, delcount = 0; rc = genlmsg_end(skb, reply); if (rc < 0) { nlmsg_free(skb); - return rc; + return; } rc = 0; listeners = &per_cpu(listener_array, cpu); down_read(&listeners->sem); - list_for_each_entry_safe(s, tmp, &listeners->list, list) { + list_for_each_entry(s, &listeners->list, list) { skb_next = NULL; if (!list_is_last(&s->list, &listeners->list)) { skb_next = skb_clone(skb_cur, GFP_KERNEL); - if (!skb_next) { - nlmsg_free(skb_cur); - rc = -ENOMEM; + if (!skb_next) break; - } } - ret = genlmsg_unicast(skb_cur, s->pid); - if (ret == -ECONNREFUSED) { + rc = genlmsg_unicast(skb_cur, s->pid); + if (rc == -ECONNREFUSED) { s->valid = 0; delcount++; - rc = ret; } skb_cur = skb_next; } up_read(&listeners->sem); + if (skb_cur) + nlmsg_free(skb_cur); + if (!delcount) - return rc; + return; /* Delete invalidated entries */ down_write(&listeners->sem); @@ -171,7 +170,6 @@ static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) } } up_write(&listeners->sem); - return rc; } static int fill_pid(pid_t pid, struct task_struct 
*pidtsk, -- cgit v1.2.3