Diffstat (limited to 'kernel')
46 files changed, 2175 insertions, 782 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index cb05cd05d237..ff4dc02ce170 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -12,6 +12,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o obj-$(CONFIG_SMP) += cpu.o spinlock.o +obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_KALLSYMS) += kallsyms.o @@ -27,6 +28,7 @@ obj-$(CONFIG_AUDIT) += audit.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_SYSFS) += ksysfs.o +obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_SECCOMP) += seccomp.o diff --git a/kernel/acct.c b/kernel/acct.c index 4168f631868e..b756f527497e 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -165,7 +165,7 @@ out: } /* - * Close the old accouting file (if currently open) and then replace + * Close the old accounting file (if currently open) and then replace * it with file (if non-NULL). * * NOTE: acct_globals.lock MUST be held on entry and exit. @@ -199,11 +199,16 @@ static void acct_file_reopen(struct file *file) } } -/* - * sys_acct() is the only system call needed to implement process - * accounting. It takes the name of the file where accounting records - * should be written. If the filename is NULL, accounting will be - * shutdown. +/** + * sys_acct - enable/disable process accounting + * @name: file name for accounting records or NULL to shutdown accounting + * + * Returns 0 for success or negative errno values for failure. + * + * sys_acct() is the only system call needed to implement process + * accounting. It takes the name of the file where accounting records + * should be written. If the filename is NULL, accounting will be + * shutdown. */ asmlinkage long sys_acct(const char __user *name) { @@ -220,7 +225,7 @@ asmlinkage long sys_acct(const char __user *name) return (PTR_ERR(tmp)); } /* Difference from BSD - they don't do O_APPEND */ - file = filp_open(tmp, O_WRONLY|O_APPEND, 0); + file = filp_open(tmp, O_WRONLY|O_APPEND|O_LARGEFILE, 0); putname(tmp); if (IS_ERR(file)) { return (PTR_ERR(file)); @@ -250,9 +255,12 @@ asmlinkage long sys_acct(const char __user *name) return (0); } -/* - * If the accouting is turned on for a file in the filesystem pointed - * to by sb, turn accouting off. +/** + * acct_auto_close - turn off a filesystem's accounting if it is on + * @sb: super block for the filesystem + * + * If the accounting is turned on for a file in the filesystem pointed + * to by sb, turn accounting off. 
*/ void acct_auto_close(struct super_block *sb) { @@ -503,8 +511,11 @@ static void do_acct_process(long exitcode, struct file *file) set_fs(fs); } -/* +/** * acct_process - now just a wrapper around do_acct_process + * @exitcode: task exit code + * + * handles process accounting for an exiting task */ void acct_process(long exitcode) { @@ -530,9 +541,9 @@ void acct_process(long exitcode) } -/* - * acct_update_integrals - * - update mm integral fields in task_struct +/** + * acct_update_integrals - update mm integral fields in task_struct + * @tsk: task_struct for accounting */ void acct_update_integrals(struct task_struct *tsk) { @@ -547,9 +558,9 @@ void acct_update_integrals(struct task_struct *tsk) } } -/* - * acct_clear_integrals - * - clear the mm integral fields in task_struct +/** + * acct_clear_integrals - clear the mm integral fields in task_struct + * @tsk: task_struct whose accounting fields are cleared */ void acct_clear_integrals(struct task_struct *tsk) { diff --git a/kernel/audit.c b/kernel/audit.c index ef35166fdc29..83096b67510a 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -79,6 +79,8 @@ static int audit_rate_limit; /* Number of outstanding audit_buffers allowed. */ static int audit_backlog_limit = 64; +static int audit_backlog_wait_time = 60 * HZ; +static int audit_backlog_wait_overflow = 0; /* The identity of the user shutting down the audit system. */ uid_t audit_sig_uid = -1; @@ -106,18 +108,12 @@ static LIST_HEAD(audit_freelist); static struct sk_buff_head audit_skb_queue; static struct task_struct *kauditd_task; static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); - -/* There are three lists of rules -- one to search at task creation - * time, one to search at syscall entry time, and another to search at - * syscall exit time. */ -static LIST_HEAD(audit_tsklist); -static LIST_HEAD(audit_entlist); -static LIST_HEAD(audit_extlist); +static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); /* The netlink socket is only to be read by 1 CPU, which lets us assume * that list additions and deletions never happen simultaneously in * auditsc.c */ -static DECLARE_MUTEX(audit_netlink_sem); +DECLARE_MUTEX(audit_netlink_sem); /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting * audit records. 
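As a caller-side illustration of the sys_acct() interface documented in the acct.c hunks above (not taken from this commit; the log path is only an example), BSD process accounting is driven from userspace through acct(2), and the patched sys_acct() now opens the log with O_LARGEFILE so records can keep appending past 2 GB:

#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* Requires CAP_SYS_PACCT.  Each exiting process appends a record
         * to the named file once accounting is switched on. */
        if (acct("/var/log/pacct") != 0)
                perror("acct(\"/var/log/pacct\")");

        /* ... run the workload to be accounted ... */

        /* A NULL filename shuts accounting back down (the kernel's
         * acct_auto_close() does the same when the filesystem goes away). */
        if (acct(NULL) != 0)
                perror("acct(NULL)");
        return 0;
}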
Since printk uses a 1024 byte buffer, this buffer @@ -137,6 +133,7 @@ struct audit_buffer { struct list_head list; struct sk_buff *skb; /* formatted skb ready to send */ struct audit_context *ctx; /* NULL or associated context */ + int gfp_mask; }; static void audit_set_pid(struct audit_buffer *ab, pid_t pid) @@ -145,11 +142,6 @@ static void audit_set_pid(struct audit_buffer *ab, pid_t pid) nlh->nlmsg_pid = pid; } -struct audit_entry { - struct list_head list; - struct audit_rule rule; -}; - static void audit_panic(const char *message) { switch (audit_failure) @@ -233,7 +225,7 @@ static int audit_set_rate_limit(int limit, uid_t loginuid) { int old = audit_rate_limit; audit_rate_limit = limit; - audit_log(NULL, AUDIT_CONFIG_CHANGE, + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, "audit_rate_limit=%d old=%d by auid=%u", audit_rate_limit, old, loginuid); return old; @@ -243,7 +235,7 @@ static int audit_set_backlog_limit(int limit, uid_t loginuid) { int old = audit_backlog_limit; audit_backlog_limit = limit; - audit_log(NULL, AUDIT_CONFIG_CHANGE, + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, "audit_backlog_limit=%d old=%d by auid=%u", audit_backlog_limit, old, loginuid); return old; @@ -255,7 +247,7 @@ static int audit_set_enabled(int state, uid_t loginuid) if (state != 0 && state != 1) return -EINVAL; audit_enabled = state; - audit_log(NULL, AUDIT_CONFIG_CHANGE, + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, "audit_enabled=%d old=%d by auid=%u", audit_enabled, old, loginuid); return old; @@ -269,7 +261,7 @@ static int audit_set_failure(int state, uid_t loginuid) && state != AUDIT_FAIL_PANIC) return -EINVAL; audit_failure = state; - audit_log(NULL, AUDIT_CONFIG_CHANGE, + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, "audit_failure=%d old=%d by auid=%u", audit_failure, old, loginuid); return old; @@ -281,6 +273,7 @@ int kauditd_thread(void *dummy) while (1) { skb = skb_dequeue(&audit_skb_queue); + wake_up(&audit_backlog_wait); if (skb) { if (audit_pid) { int err = netlink_unicast(audit_sock, skb, audit_pid, 0); @@ -290,7 +283,7 @@ int kauditd_thread(void *dummy) audit_pid = 0; } } else { - printk(KERN_ERR "%s\n", skb->data + NLMSG_SPACE(0)); + printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0)); kfree_skb(skb); } } else { @@ -423,7 +416,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (status_get->mask & AUDIT_STATUS_PID) { int old = audit_pid; audit_pid = status_get->pid; - audit_log(NULL, AUDIT_CONFIG_CHANGE, + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, "audit_pid=%d old=%d by auid=%u", audit_pid, old, loginuid); } @@ -435,15 +428,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) break; case AUDIT_USER: case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: - ab = audit_log_start(NULL, msg_type); - if (!ab) - break; /* audit_panic has been called */ - audit_log_format(ab, - "user pid=%d uid=%u auid=%u" - " msg='%.1024s'", - pid, uid, loginuid, (char *)data); - audit_set_pid(ab, pid); - audit_log_end(ab); + if (!audit_enabled && msg_type != AUDIT_USER_AVC) + return 0; + + err = audit_filter_user(&NETLINK_CB(skb), msg_type); + if (err == 1) { + err = 0; + ab = audit_log_start(NULL, GFP_KERNEL, msg_type); + if (ab) { + audit_log_format(ab, + "user pid=%d uid=%u auid=%u msg='%.1024s'", + pid, uid, loginuid, (char *)data); + audit_set_pid(ab, pid); + audit_log_end(ab); + } + } break; case AUDIT_ADD: case AUDIT_DEL: @@ -514,7 +513,8 @@ static int __init audit_init(void) { printk(KERN_INFO "audit: initializing netlink 
socket (%s)\n", audit_default ? "enabled" : "disabled"); - audit_sock = netlink_kernel_create(NETLINK_AUDIT, audit_receive); + audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive, + THIS_MODULE); if (!audit_sock) audit_panic("cannot initialize netlink socket"); @@ -522,7 +522,7 @@ static int __init audit_init(void) skb_queue_head_init(&audit_skb_queue); audit_initialized = 1; audit_enabled = audit_default; - audit_log(NULL, AUDIT_KERNEL, "initialized"); + audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); return 0; } __initcall(audit_init); @@ -560,7 +560,7 @@ static void audit_buffer_free(struct audit_buffer *ab) } static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx, - int gfp_mask, int type) + unsigned int __nocast gfp_mask, int type) { unsigned long flags; struct audit_buffer *ab = NULL; @@ -586,6 +586,7 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx, goto err; ab->ctx = ctx; + ab->gfp_mask = gfp_mask; nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0)); nlh->nlmsg_type = type; nlh->nlmsg_flags = 0; @@ -605,26 +606,27 @@ err: * (timestamp,serial) tuple is unique for each syscall and is live from * syscall entry to syscall exit. * - * Atomic values are only guaranteed to be 24-bit, so we count down. - * * NOTE: Another possibility is to store the formatted records off the * audit context (for those records that have a context), and emit them * all at syscall exit. However, this could delay the reporting of * significant errors until syscall exit (or never, if the system * halts). */ + unsigned int audit_serial(void) { - static atomic_t serial = ATOMIC_INIT(0xffffff); - unsigned int a, b; + static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED; + static unsigned int serial = 0; + + unsigned long flags; + unsigned int ret; + spin_lock_irqsave(&serial_lock, flags); do { - a = atomic_read(&serial); - if (atomic_dec_and_test(&serial)) - atomic_set(&serial, 0xffffff); - b = atomic_read(&serial); - } while (b != a - 1); + ret = ++serial; + } while (unlikely(!ret)); + spin_unlock_irqrestore(&serial_lock, flags); - return 0xffffff - b; + return ret; } static inline void audit_get_stamp(struct audit_context *ctx, @@ -644,17 +646,43 @@ static inline void audit_get_stamp(struct audit_context *ctx, * syscall, then the syscall is marked as auditable and an audit record * will be written at syscall exit. If there is no associated task, tsk * should be NULL. 
*/ -struct audit_buffer *audit_log_start(struct audit_context *ctx, int type) + +struct audit_buffer *audit_log_start(struct audit_context *ctx, int gfp_mask, + int type) { struct audit_buffer *ab = NULL; struct timespec t; unsigned int serial; + int reserve; + unsigned long timeout_start = jiffies; if (!audit_initialized) return NULL; - if (audit_backlog_limit - && skb_queue_len(&audit_skb_queue) > audit_backlog_limit) { + if (gfp_mask & __GFP_WAIT) + reserve = 0; + else + reserve = 5; /* Allow atomic callers to go up to five + entries over the normal backlog limit */ + + while (audit_backlog_limit + && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { + if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time + && time_before(jiffies, timeout_start + audit_backlog_wait_time)) { + + /* Wait for auditd to drain the queue a little */ + DECLARE_WAITQUEUE(wait, current); + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&audit_backlog_wait, &wait); + + if (audit_backlog_limit && + skb_queue_len(&audit_skb_queue) > audit_backlog_limit) + schedule_timeout(timeout_start + audit_backlog_wait_time - jiffies); + + __set_current_state(TASK_RUNNING); + remove_wait_queue(&audit_backlog_wait, &wait); + continue; + } if (audit_rate_check()) printk(KERN_WARNING "audit: audit_backlog=%d > " @@ -662,10 +690,12 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, int type) skb_queue_len(&audit_skb_queue), audit_backlog_limit); audit_log_lost("backlog limit exceeded"); + audit_backlog_wait_time = audit_backlog_wait_overflow; + wake_up(&audit_backlog_wait); return NULL; } - ab = audit_buffer_alloc(ctx, GFP_ATOMIC, type); + ab = audit_buffer_alloc(ctx, gfp_mask, type); if (!ab) { audit_log_lost("out of memory in audit_log_start"); return NULL; @@ -689,7 +719,7 @@ static inline int audit_expand(struct audit_buffer *ab, int extra) { struct sk_buff *skb = ab->skb; int ret = pskb_expand_head(skb, skb_headroom(skb), extra, - GFP_ATOMIC); + ab->gfp_mask); if (ret < 0) { audit_log_lost("out of memory in audit_expand"); return 0; @@ -808,7 +838,7 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, audit_log_format(ab, " %s", prefix); /* We will allow 11 spaces for ' (deleted)' to be appended */ - path = kmalloc(PATH_MAX+11, GFP_KERNEL); + path = kmalloc(PATH_MAX+11, ab->gfp_mask); if (!path) { audit_log_format(ab, "<no memory>"); return; @@ -840,7 +870,7 @@ void audit_log_end(struct audit_buffer *ab) ab->skb = NULL; wake_up_interruptible(&kauditd_wait); } else { - printk("%s\n", ab->skb->data + NLMSG_SPACE(0)); + printk(KERN_NOTICE "%s\n", ab->skb->data + NLMSG_SPACE(0)); } } audit_buffer_free(ab); @@ -849,12 +879,13 @@ void audit_log_end(struct audit_buffer *ab) /* Log an audit record. This is a convenience function that calls * audit_log_start, audit_log_vformat, and audit_log_end. It may be * called in any context. */ -void audit_log(struct audit_context *ctx, int type, const char *fmt, ...) +void audit_log(struct audit_context *ctx, int gfp_mask, int type, + const char *fmt, ...) 
{ struct audit_buffer *ab; va_list args; - ab = audit_log_start(ctx, type); + ab = audit_log_start(ctx, gfp_mask, type); if (ab) { va_start(args, fmt); audit_log_vformat(ab, fmt, args); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index e75f84e1a1a0..88696f639aab 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -39,6 +39,9 @@ #include <linux/audit.h> #include <linux/personality.h> #include <linux/time.h> +#include <linux/kthread.h> +#include <linux/netlink.h> +#include <linux/compiler.h> #include <asm/unistd.h> /* 0 = no checking @@ -95,6 +98,7 @@ struct audit_names { uid_t uid; gid_t gid; dev_t rdev; + unsigned flags; }; struct audit_aux_data { @@ -167,9 +171,16 @@ struct audit_context { /* There are three lists of rules -- one to search at task creation * time, one to search at syscall entry time, and another to search at * syscall exit time. */ -static LIST_HEAD(audit_tsklist); -static LIST_HEAD(audit_entlist); -static LIST_HEAD(audit_extlist); +static struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { + LIST_HEAD_INIT(audit_filter_list[0]), + LIST_HEAD_INIT(audit_filter_list[1]), + LIST_HEAD_INIT(audit_filter_list[2]), + LIST_HEAD_INIT(audit_filter_list[3]), + LIST_HEAD_INIT(audit_filter_list[4]), +#if AUDIT_NR_FILTERS != 5 +#error Fix audit_filter_list initialiser +#endif +}; struct audit_entry { struct list_head list; @@ -179,9 +190,36 @@ struct audit_entry { extern int audit_pid; +/* Copy rule from user-space to kernel-space. Called from + * audit_add_rule during AUDIT_ADD. */ +static inline int audit_copy_rule(struct audit_rule *d, struct audit_rule *s) +{ + int i; + + if (s->action != AUDIT_NEVER + && s->action != AUDIT_POSSIBLE + && s->action != AUDIT_ALWAYS) + return -1; + if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS) + return -1; + if ((s->flags & ~AUDIT_FILTER_PREPEND) >= AUDIT_NR_FILTERS) + return -1; + + d->flags = s->flags; + d->action = s->action; + d->field_count = s->field_count; + for (i = 0; i < d->field_count; i++) { + d->fields[i] = s->fields[i]; + d->values[i] = s->values[i]; + } + for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i]; + return 0; +} + /* Check to see if two rules are identical. It is called from + * audit_add_rule during AUDIT_ADD and * audit_del_rule during AUDIT_DEL. */ -static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b) +static inline int audit_compare_rule(struct audit_rule *a, struct audit_rule *b) { int i; @@ -210,19 +248,37 @@ static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b) /* Note that audit_add_rule and audit_del_rule are called via * audit_receive() in audit.c, and are protected by * audit_netlink_sem. */ -static inline int audit_add_rule(struct audit_entry *entry, - struct list_head *list) +static inline int audit_add_rule(struct audit_rule *rule, + struct list_head *list) { - if (entry->rule.flags & AUDIT_PREPEND) { - entry->rule.flags &= ~AUDIT_PREPEND; + struct audit_entry *entry; + + /* Do not use the _rcu iterator here, since this is the only + * addition routine. 
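The audit.c hunks above thread a gfp_mask from audit_log_start()/audit_log() down to the skb allocation, let __GFP_WAIT callers sleep briefly while kauditd drains the backlog, and give atomic callers a small reserve above audit_backlog_limit. A minimal sketch of an in-kernel caller under the new API (my_subsys_audit() is a hypothetical function, not part of this patch):

static void my_subsys_audit(int err, int in_atomic_ctx)
{
        struct audit_buffer *ab;

        /* Process-context callers pass GFP_KERNEL and may wait for queue
         * space; atomic-context callers pass GFP_ATOMIC and may dip into
         * the five-entry reserve instead. */
        ab = audit_log_start(NULL, in_atomic_ctx ? GFP_ATOMIC : GFP_KERNEL,
                             AUDIT_KERNEL);
        if (!ab)
                return;         /* record dropped; audit_log_lost() already ran */
        audit_log_format(ab, "my_subsys err=%d", err);
        audit_log_end(ab);
}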
*/ + list_for_each_entry(entry, list, list) { + if (!audit_compare_rule(rule, &entry->rule)) { + return -EEXIST; + } + } + + if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL))) + return -ENOMEM; + if (audit_copy_rule(&entry->rule, rule)) { + kfree(entry); + return -EINVAL; + } + + if (entry->rule.flags & AUDIT_FILTER_PREPEND) { + entry->rule.flags &= ~AUDIT_FILTER_PREPEND; list_add_rcu(&entry->list, list); } else { list_add_tail_rcu(&entry->list, list); } + return 0; } -static void audit_free_rule(struct rcu_head *head) +static inline void audit_free_rule(struct rcu_head *head) { struct audit_entry *e = container_of(head, struct audit_entry, rcu); kfree(e); @@ -245,82 +301,82 @@ static inline int audit_del_rule(struct audit_rule *rule, return 0; } } - return -EFAULT; /* No matching rule */ + return -ENOENT; /* No matching rule */ } -/* Copy rule from user-space to kernel-space. Called during - * AUDIT_ADD. */ -static int audit_copy_rule(struct audit_rule *d, struct audit_rule *s) +static int audit_list_rules(void *_dest) { + int pid, seq; + int *dest = _dest; + struct audit_entry *entry; int i; - if (s->action != AUDIT_NEVER - && s->action != AUDIT_POSSIBLE - && s->action != AUDIT_ALWAYS) - return -1; - if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS) - return -1; + pid = dest[0]; + seq = dest[1]; + kfree(dest); - d->flags = s->flags; - d->action = s->action; - d->field_count = s->field_count; - for (i = 0; i < d->field_count; i++) { - d->fields[i] = s->fields[i]; - d->values[i] = s->values[i]; + down(&audit_netlink_sem); + + /* The *_rcu iterators not needed here because we are + always called with audit_netlink_sem held. */ + for (i=0; i<AUDIT_NR_FILTERS; i++) { + list_for_each_entry(entry, &audit_filter_list[i], list) + audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, + &entry->rule, sizeof(entry->rule)); } - for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i]; + audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); + + up(&audit_netlink_sem); return 0; } int audit_receive_filter(int type, int pid, int uid, int seq, void *data, uid_t loginuid) { - u32 flags; - struct audit_entry *entry; + struct task_struct *tsk; + int *dest; int err = 0; + unsigned listnr; switch (type) { case AUDIT_LIST: - /* The *_rcu iterators not needed here because we are - always called with audit_netlink_sem held. */ - list_for_each_entry(entry, &audit_tsklist, list) - audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, - &entry->rule, sizeof(entry->rule)); - list_for_each_entry(entry, &audit_entlist, list) - audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, - &entry->rule, sizeof(entry->rule)); - list_for_each_entry(entry, &audit_extlist, list) - audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, - &entry->rule, sizeof(entry->rule)); - audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); + /* We can't just spew out the rules here because we might fill + * the available socket buffer space and deadlock waiting for + * auditctl to read from it... 
which isn't ever going to + * happen if we're actually running in the context of auditctl + * trying to _send_ the stuff */ + + dest = kmalloc(2 * sizeof(int), GFP_KERNEL); + if (!dest) + return -ENOMEM; + dest[0] = pid; + dest[1] = seq; + + tsk = kthread_run(audit_list_rules, dest, "audit_list_rules"); + if (IS_ERR(tsk)) { + kfree(dest); + err = PTR_ERR(tsk); + } break; case AUDIT_ADD: - if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL))) - return -ENOMEM; - if (audit_copy_rule(&entry->rule, data)) { - kfree(entry); + listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND; + if (listnr >= AUDIT_NR_FILTERS) return -EINVAL; - } - flags = entry->rule.flags; - if (!err && (flags & AUDIT_PER_TASK)) - err = audit_add_rule(entry, &audit_tsklist); - if (!err && (flags & AUDIT_AT_ENTRY)) - err = audit_add_rule(entry, &audit_entlist); - if (!err && (flags & AUDIT_AT_EXIT)) - err = audit_add_rule(entry, &audit_extlist); - audit_log(NULL, AUDIT_CONFIG_CHANGE, - "auid=%u added an audit rule\n", loginuid); + + err = audit_add_rule(data, &audit_filter_list[listnr]); + if (!err) + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "auid=%u added an audit rule\n", loginuid); break; case AUDIT_DEL: - flags =((struct audit_rule *)data)->flags; - if (!err && (flags & AUDIT_PER_TASK)) - err = audit_del_rule(data, &audit_tsklist); - if (!err && (flags & AUDIT_AT_ENTRY)) - err = audit_del_rule(data, &audit_entlist); - if (!err && (flags & AUDIT_AT_EXIT)) - err = audit_del_rule(data, &audit_extlist); - audit_log(NULL, AUDIT_CONFIG_CHANGE, - "auid=%u removed an audit rule\n", loginuid); + listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND; + if (listnr >= AUDIT_NR_FILTERS) + return -EINVAL; + + err = audit_del_rule(data, &audit_filter_list[listnr]); + if (!err) + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "auid=%u removed an audit rule\n", loginuid); break; default: return -EINVAL; @@ -384,8 +440,12 @@ static int audit_filter_rules(struct task_struct *tsk, result = (ctx->return_code == value); break; case AUDIT_SUCCESS: - if (ctx && ctx->return_valid) - result = (ctx->return_valid == AUDITSC_SUCCESS); + if (ctx && ctx->return_valid) { + if (value) + result = (ctx->return_valid == AUDITSC_SUCCESS); + else + result = (ctx->return_valid == AUDITSC_FAILURE); + } break; case AUDIT_DEVMAJOR: if (ctx) { @@ -454,7 +514,7 @@ static enum audit_state audit_filter_task(struct task_struct *tsk) enum audit_state state; rcu_read_lock(); - list_for_each_entry_rcu(e, &audit_tsklist, list) { + list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { if (audit_filter_rules(tsk, &e->rule, NULL, &state)) { rcu_read_unlock(); return state; @@ -474,20 +534,84 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, struct list_head *list) { struct audit_entry *e; + enum audit_state state; + + if (audit_pid && tsk->tgid == audit_pid) + return AUDIT_DISABLED; + + rcu_read_lock(); + if (!list_empty(list)) { + int word = AUDIT_WORD(ctx->major); + int bit = AUDIT_BIT(ctx->major); + + list_for_each_entry_rcu(e, list, list) { + if ((e->rule.mask[word] & bit) == bit + && audit_filter_rules(tsk, &e->rule, ctx, &state)) { + rcu_read_unlock(); + return state; + } + } + } + rcu_read_unlock(); + return AUDIT_BUILD_CONTEXT; +} + +static int audit_filter_user_rules(struct netlink_skb_parms *cb, + struct audit_rule *rule, + enum audit_state *state) +{ + int i; + + for (i = 0; i < rule->field_count; i++) { + u32 field = rule->fields[i] & ~AUDIT_NEGATE; + u32 value = rule->values[i]; + 
int result = 0; + + switch (field) { + case AUDIT_PID: + result = (cb->creds.pid == value); + break; + case AUDIT_UID: + result = (cb->creds.uid == value); + break; + case AUDIT_GID: + result = (cb->creds.gid == value); + break; + case AUDIT_LOGINUID: + result = (cb->loginuid == value); + break; + } + + if (rule->fields[i] & AUDIT_NEGATE) + result = !result; + if (!result) + return 0; + } + switch (rule->action) { + case AUDIT_NEVER: *state = AUDIT_DISABLED; break; + case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break; + case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; + } + return 1; +} + +int audit_filter_user(struct netlink_skb_parms *cb, int type) +{ + struct audit_entry *e; enum audit_state state; - int word = AUDIT_WORD(ctx->major); - int bit = AUDIT_BIT(ctx->major); + int ret = 1; rcu_read_lock(); - list_for_each_entry_rcu(e, list, list) { - if ((e->rule.mask[word] & bit) == bit - && audit_filter_rules(tsk, &e->rule, ctx, &state)) { - rcu_read_unlock(); - return state; + list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { + if (audit_filter_user_rules(cb, &e->rule, &state)) { + if (state == AUDIT_DISABLED) + ret = 0; + break; } } rcu_read_unlock(); - return AUDIT_BUILD_CONTEXT; + + return ret; /* Audit by default */ } /* This should be called with task_lock() held. */ @@ -504,7 +628,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, if (context->in_syscall && !context->auditable) { enum audit_state state; - state = audit_filter_syscall(tsk, context, &audit_extlist); + state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]); if (state == AUDIT_RECORD_CONTEXT) context->auditable = 1; } @@ -679,13 +803,13 @@ static void audit_log_task_info(struct audit_buffer *ab) up_read(&mm->mmap_sem); } -static void audit_log_exit(struct audit_context *context) +static void audit_log_exit(struct audit_context *context, unsigned int gfp_mask) { int i; struct audit_buffer *ab; struct audit_aux_data *aux; - ab = audit_log_start(context, AUDIT_SYSCALL); + ab = audit_log_start(context, gfp_mask, AUDIT_SYSCALL); if (!ab) return; /* audit_panic has been called */ audit_log_format(ab, "arch=%x syscall=%d", @@ -717,7 +841,7 @@ static void audit_log_exit(struct audit_context *context) for (aux = context->aux; aux; aux = aux->next) { - ab = audit_log_start(context, aux->type); + ab = audit_log_start(context, GFP_KERNEL, aux->type); if (!ab) continue; /* audit_panic has been called */ @@ -754,14 +878,14 @@ static void audit_log_exit(struct audit_context *context) } if (context->pwd && context->pwdmnt) { - ab = audit_log_start(context, AUDIT_CWD); + ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); if (ab) { audit_log_d_path(ab, "cwd=", context->pwd, context->pwdmnt); audit_log_end(ab); } } for (i = 0; i < context->name_count; i++) { - ab = audit_log_start(context, AUDIT_PATH); + ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); if (!ab) continue; /* audit_panic has been called */ @@ -770,6 +894,8 @@ static void audit_log_exit(struct audit_context *context) audit_log_format(ab, " name="); audit_log_untrustedstring(ab, context->names[i].name); } + audit_log_format(ab, " flags=%x\n", context->names[i].flags); + if (context->names[i].ino != (unsigned long)-1) audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#o" " ouid=%u ogid=%u rdev=%02x:%02x", @@ -799,9 +925,11 @@ void audit_free(struct task_struct *tsk) return; /* Check for system calls that do not go through the exit - * function (e.g., exit_group), then 
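The auditsc.c rework above folds the old task/entry/exit rule lists into audit_filter_list[], indexed by the rule's flags field (AUDIT_FILTER_TASK, AUDIT_FILTER_ENTRY, AUDIT_FILTER_EXIT, AUDIT_FILTER_USER), so each rule now names exactly one list. A hedged userspace sketch of building such a rule, assuming the structure and constants exported by <linux/audit.h> (field values chosen arbitrarily):

#include <string.h>
#include <linux/audit.h>
#include <asm/unistd.h>

static void fill_example_rule(struct audit_rule *r)
{
        memset(r, 0, sizeof(*r));
        r->flags       = AUDIT_FILTER_EXIT;  /* -> audit_filter_list[AUDIT_FILTER_EXIT] */
        r->action      = AUDIT_ALWAYS;
        r->field_count = 1;
        r->fields[0]   = AUDIT_UID;          /* match records where ... */
        r->values[0]   = 500;                /* ... uid == 500          */
        r->mask[AUDIT_WORD(__NR_open)] |= AUDIT_BIT(__NR_open);  /* open(2) only */
        /* The rule then travels to the kernel in an AUDIT_ADD netlink
         * message; audit_receive_filter() picks the list from r->flags. */
}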
free context block. */ - if (context->in_syscall && context->auditable && context->pid != audit_pid) - audit_log_exit(context); + * function (e.g., exit_group), then free context block. + * We use GFP_ATOMIC here because we might be doing this + * in the context of the idle thread */ + if (context->in_syscall && context->auditable) + audit_log_exit(context, GFP_ATOMIC); audit_free_context(context); } @@ -876,11 +1004,11 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major, state = context->state; if (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT) - state = audit_filter_syscall(tsk, context, &audit_entlist); + state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); if (likely(state == AUDIT_DISABLED)) return; - context->serial = audit_serial(); + context->serial = 0; context->ctime = CURRENT_TIME; context->in_syscall = 1; context->auditable = !!(state == AUDIT_RECORD_CONTEXT); @@ -903,10 +1031,10 @@ void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code) /* Not having a context here is ok, since the parent may have * called __put_task_struct. */ if (likely(!context)) - return; + goto out; - if (context->in_syscall && context->auditable && context->pid != audit_pid) - audit_log_exit(context); + if (context->in_syscall && context->auditable) + audit_log_exit(context, GFP_KERNEL); context->in_syscall = 0; context->auditable = 0; @@ -919,9 +1047,9 @@ void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code) } else { audit_free_names(context); audit_free_aux(context); - audit_zero_context(context, context->state); tsk->audit_context = context; } + out: put_task_struct(tsk); } @@ -996,7 +1124,7 @@ void audit_putname(const char *name) /* Store the inode and device from a lookup. Called from * fs/namei.c:path_lookup(). 
*/ -void audit_inode(const char *name, const struct inode *inode) +void audit_inode(const char *name, const struct inode *inode, unsigned flags) { int idx; struct audit_context *context = current->audit_context; @@ -1022,17 +1150,20 @@ void audit_inode(const char *name, const struct inode *inode) ++context->ino_count; #endif } - context->names[idx].ino = inode->i_ino; - context->names[idx].dev = inode->i_sb->s_dev; - context->names[idx].mode = inode->i_mode; - context->names[idx].uid = inode->i_uid; - context->names[idx].gid = inode->i_gid; - context->names[idx].rdev = inode->i_rdev; + context->names[idx].flags = flags; + context->names[idx].ino = inode->i_ino; + context->names[idx].dev = inode->i_sb->s_dev; + context->names[idx].mode = inode->i_mode; + context->names[idx].uid = inode->i_uid; + context->names[idx].gid = inode->i_gid; + context->names[idx].rdev = inode->i_rdev; } void auditsc_get_stamp(struct audit_context *ctx, struct timespec *t, unsigned int *serial) { + if (!ctx->serial) + ctx->serial = audit_serial(); t->tv_sec = ctx->ctime.tv_sec; t->tv_nsec = ctx->ctime.tv_nsec; *serial = ctx->serial; @@ -1044,7 +1175,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid) if (task->audit_context) { struct audit_buffer *ab; - ab = audit_log_start(NULL, AUDIT_LOGIN); + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); if (ab) { audit_log_format(ab, "login pid=%d uid=%u " "old auid=%u new auid=%u", @@ -1153,7 +1284,7 @@ void audit_signal_info(int sig, struct task_struct *t) extern pid_t audit_sig_pid; extern uid_t audit_sig_uid; - if (unlikely(audit_pid && t->pid == audit_pid)) { + if (unlikely(audit_pid && t->tgid == audit_pid)) { if (sig == SIGTERM || sig == SIGHUP) { struct audit_context *ctx = current->audit_context; audit_sig_pid = current->pid; diff --git a/kernel/capability.c b/kernel/capability.c index 64db1ee820c2..8986a37a67ea 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -31,8 +31,14 @@ static DEFINE_SPINLOCK(task_capability_lock); * uninteresting and/or not to be changed. */ -/* +/** * sys_capget - get the capabilities of a given process. + * @header: pointer to struct that contains capability version and + * target pid data + * @dataptr: pointer to struct that contains the effective, permitted, + * and inheritable capabilities that are returned + * + * Returns 0 on success and < 0 on error. */ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) { @@ -141,8 +147,14 @@ static inline int cap_set_all(kernel_cap_t *effective, return ret; } -/* - * sys_capset - set capabilities for a given process, all processes, or all +/** + * sys_capset - set capabilities for a process or a group of processes + * @header: pointer to struct that contains capability version and + * target pid data + * @data: pointer to struct that contains the effective, permitted, + * and inheritable capabilities + * + * Set capabilities for a given process, all processes, or all * processes in a given process group. * * The restrictions on setting capabilities are specified as: @@ -152,6 +164,8 @@ static inline int cap_set_all(kernel_cap_t *effective, * I: any raised capabilities must be a subset of the (old current) permitted * P: any raised capabilities must be a subset of the (old current) permitted * E: must be set to a subset of (new target) permitted + * + * Returns 0 on success and < 0 on error. 
*/ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) { diff --git a/kernel/compat.c b/kernel/compat.c index ddfcaaa86623..102296e21ea8 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -48,8 +48,7 @@ static long compat_nanosleep_restart(struct restart_block *restart) if (!time_after(expire, now)) return 0; - current->state = TASK_INTERRUPTIBLE; - expire = schedule_timeout(expire - now); + expire = schedule_timeout_interruptible(expire - now); if (expire == 0) return 0; @@ -82,8 +81,7 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, return -EINVAL; expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); - current->state = TASK_INTERRUPTIBLE; - expire = schedule_timeout(expire); + expire = schedule_timeout_interruptible(expire); if (expire == 0) return 0; @@ -795,8 +793,7 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - current->state = TASK_INTERRUPTIBLE; - timeout = schedule_timeout(timeout); + timeout = schedule_timeout_interruptible(timeout); spin_lock_irq(¤t->sighand->siglock); sig = dequeue_signal(current, &s, &info); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 984c0bf3807f..79866bc6b3a1 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -180,6 +180,42 @@ static struct super_block *cpuset_sb = NULL; */ static DECLARE_MUTEX(cpuset_sem); +static struct task_struct *cpuset_sem_owner; +static int cpuset_sem_depth; + +/* + * The global cpuset semaphore cpuset_sem can be needed by the + * memory allocator to update a tasks mems_allowed (see the calls + * to cpuset_update_current_mems_allowed()) or to walk up the + * cpuset hierarchy to find a mem_exclusive cpuset see the calls + * to cpuset_excl_nodes_overlap()). + * + * But if the memory allocation is being done by cpuset.c code, it + * usually already holds cpuset_sem. Double tripping on a kernel + * semaphore deadlocks the current task, and any other task that + * subsequently tries to obtain the lock. + * + * Run all up's and down's on cpuset_sem through the following + * wrappers, which will detect this nested locking, and avoid + * deadlocking. + */ + +static inline void cpuset_down(struct semaphore *psem) +{ + if (cpuset_sem_owner != current) { + down(psem); + cpuset_sem_owner = current; + } + cpuset_sem_depth++; +} + +static inline void cpuset_up(struct semaphore *psem) +{ + if (--cpuset_sem_depth == 0) { + cpuset_sem_owner = NULL; + up(psem); + } +} /* * A couple of forward declarations required, due to cyclic reference loop: @@ -398,21 +434,31 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen) * to continue to serve a useful existence. Next time it's released, * we will get notified again, if it still has 'notify_on_release' set. * - * Note final arg to call_usermodehelper() is 0 - that means - * don't wait. Since we are holding the global cpuset_sem here, - * and we are asking another thread (started from keventd) to rmdir a - * cpuset, we can't wait - or we'd deadlock with the removing thread - * on cpuset_sem. + * The final arg to call_usermodehelper() is 0, which means don't + * wait. The separate /sbin/cpuset_release_agent task is forked by + * call_usermodehelper(), then control in this thread returns here, + * without waiting for the release agent task. We don't bother to + * wait because the caller of this routine has no use for the exit + * status of the /sbin/cpuset_release_agent task, so no sense holding + * our caller up for that. 
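The cpuset_down()/cpuset_up() wrappers added above make cpuset_sem behave recursively for the task that already owns it, which is what allows a memory allocation performed under the semaphore to re-enter cpuset code without deadlocking. A simplified sketch of that situation (cpuset_sem is static to kernel/cpuset.c, so imagine this living there; example_cpuset_op() is invented for illustration):

static void example_cpuset_op(void)
{
        void *p;

        cpuset_down(&cpuset_sem);       /* depth 1, owner = current */

        /* The allocator may call back into cpuset_zone_allowed() or
         * cpuset_update_current_mems_allowed(); their nested
         * cpuset_down() only bumps cpuset_sem_depth to 2. */
        p = kmalloc(64, GFP_KERNEL);
        kfree(p);

        cpuset_up(&cpuset_sem);         /* depth back to 0, semaphore released */
}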
+ * + * The simple act of forking that task might require more memory, + * which might need cpuset_sem. So this routine must be called while + * cpuset_sem is not held, to avoid a possible deadlock. See also + * comments for check_for_release(), below. */ -static int cpuset_release_agent(char *cpuset_str) +static void cpuset_release_agent(const char *pathbuf) { char *argv[3], *envp[3]; int i; + if (!pathbuf) + return; + i = 0; argv[i++] = "/sbin/cpuset_release_agent"; - argv[i++] = cpuset_str; + argv[i++] = (char *)pathbuf; argv[i] = NULL; i = 0; @@ -421,17 +467,29 @@ static int cpuset_release_agent(char *cpuset_str) envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[i] = NULL; - return call_usermodehelper(argv[0], argv, envp, 0); + call_usermodehelper(argv[0], argv, envp, 0); + kfree(pathbuf); } /* * Either cs->count of using tasks transitioned to zero, or the * cs->children list of child cpusets just became empty. If this * cs is notify_on_release() and now both the user count is zero and - * the list of children is empty, send notice to user land. + * the list of children is empty, prepare cpuset path in a kmalloc'd + * buffer, to be returned via ppathbuf, so that the caller can invoke + * cpuset_release_agent() with it later on, once cpuset_sem is dropped. + * Call here with cpuset_sem held. + * + * This check_for_release() routine is responsible for kmalloc'ing + * pathbuf. The above cpuset_release_agent() is responsible for + * kfree'ing pathbuf. The caller of these routines is responsible + * for providing a pathbuf pointer, initialized to NULL, then + * calling check_for_release() with cpuset_sem held and the address + * of the pathbuf pointer, then dropping cpuset_sem, then calling + * cpuset_release_agent() with pathbuf, as set by check_for_release(). */ -static void check_for_release(struct cpuset *cs) +static void check_for_release(struct cpuset *cs, char **ppathbuf) { if (notify_on_release(cs) && atomic_read(&cs->count) == 0 && list_empty(&cs->children)) { @@ -441,10 +499,9 @@ static void check_for_release(struct cpuset *cs) if (!buf) return; if (cpuset_path(cs, buf, PAGE_SIZE) < 0) - goto out; - cpuset_release_agent(buf); -out: - kfree(buf); + kfree(buf); + else + *ppathbuf = buf; } } @@ -501,19 +558,10 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) * Refresh current tasks mems_allowed and mems_generation from * current tasks cpuset. Call with cpuset_sem held. * - * Be sure to call refresh_mems() on any cpuset operation which - * (1) holds cpuset_sem, and (2) might possibly alloc memory. - * Call after obtaining cpuset_sem lock, before any possible - * allocation. Otherwise one risks trying to allocate memory - * while the task cpuset_mems_generation is not the same as - * the mems_generation in its cpuset, which would deadlock on - * cpuset_sem in cpuset_update_current_mems_allowed(). - * - * Since we hold cpuset_sem, once refresh_mems() is called, the - * test (current->cpuset_mems_generation != cs->mems_generation) - * in cpuset_update_current_mems_allowed() will remain false, - * until we drop cpuset_sem. Anyone else who would change our - * cpusets mems_generation needs to lock cpuset_sem first. + * This routine is needed to update the per-task mems_allowed + * data, within the tasks context, when it is trying to allocate + * memory (in various mm/mempolicy.c routines) and notices + * that some other task has been modifying its cpuset. 
*/ static void refresh_mems(void) @@ -606,6 +654,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) * Call with cpuset_sem held. May nest a call to the * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. */ + static void update_cpu_domains(struct cpuset *cur) { struct cpuset *c, *par = cur->parent; @@ -727,14 +776,14 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) return 0; } -static int attach_task(struct cpuset *cs, char *buf) +static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) { pid_t pid; struct task_struct *tsk; struct cpuset *oldcs; cpumask_t cpus; - if (sscanf(buf, "%d", &pid) != 1) + if (sscanf(pidbuf, "%d", &pid) != 1) return -EIO; if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) return -ENOSPC; @@ -777,7 +826,7 @@ static int attach_task(struct cpuset *cs, char *buf) put_task_struct(tsk); if (atomic_dec_and_test(&oldcs->count)) - check_for_release(oldcs); + check_for_release(oldcs, ppathbuf); return 0; } @@ -801,6 +850,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us struct cftype *cft = __d_cft(file->f_dentry); cpuset_filetype_t type = cft->private; char *buffer; + char *pathbuf = NULL; int retval = 0; /* Crude upper limit on largest legitimate cpulist user might write. */ @@ -817,7 +867,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us } buffer[nbytes] = 0; /* nul-terminate */ - down(&cpuset_sem); + cpuset_down(&cpuset_sem); if (is_removed(cs)) { retval = -ENODEV; @@ -841,7 +891,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer); break; case FILE_TASKLIST: - retval = attach_task(cs, buffer); + retval = attach_task(cs, buffer, &pathbuf); break; default: retval = -EINVAL; @@ -851,7 +901,8 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us if (retval == 0) retval = nbytes; out2: - up(&cpuset_sem); + cpuset_up(&cpuset_sem); + cpuset_release_agent(pathbuf); out1: kfree(buffer); return retval; @@ -890,9 +941,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) { cpumask_t mask; - down(&cpuset_sem); + cpuset_down(&cpuset_sem); mask = cs->cpus_allowed; - up(&cpuset_sem); + cpuset_up(&cpuset_sem); return cpulist_scnprintf(page, PAGE_SIZE, mask); } @@ -901,9 +952,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) { nodemask_t mask; - down(&cpuset_sem); + cpuset_down(&cpuset_sem); mask = cs->mems_allowed; - up(&cpuset_sem); + cpuset_up(&cpuset_sem); return nodelist_scnprintf(page, PAGE_SIZE, mask); } @@ -948,6 +999,10 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, *s++ = '\n'; *s = '\0'; + /* Do nothing if *ppos is at the eof or beyond the eof. */ + if (s - page <= *ppos) + return 0; + start = page + *ppos; n = s - start; retval = n - copy_to_user(buf, start, min(n, nbytes)); @@ -1306,8 +1361,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) if (!cs) return -ENOMEM; - down(&cpuset_sem); - refresh_mems(); + cpuset_down(&cpuset_sem); cs->flags = 0; if (notify_on_release(parent)) set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); @@ -1332,14 +1386,14 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) * will down() this new directory's i_sem and if we race with * another mkdir, we might deadlock. 
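The check_for_release()/cpuset_release_agent() split above imposes a small calling protocol, spelled out in the comments: start with a NULL pathbuf, call check_for_release() while holding cpuset_sem, drop the semaphore, then hand pathbuf to cpuset_release_agent(), which forks the agent and kfrees the buffer. A condensed sketch of a caller following that protocol, modeled on the cpuset_exit()/attach_task() hunks (example_drop_cpuset_ref() is an invented name):

static void example_drop_cpuset_ref(struct cpuset *cs)
{
        char *pathbuf = NULL;

        cpuset_down(&cpuset_sem);
        if (atomic_dec_and_test(&cs->count))
                check_for_release(cs, &pathbuf);  /* may kmalloc the cpuset path */
        cpuset_up(&cpuset_sem);

        /* Forking /sbin/cpuset_release_agent may itself allocate memory and
         * so must not run under cpuset_sem; this call also kfrees pathbuf
         * and is a no-op when pathbuf stayed NULL. */
        cpuset_release_agent(pathbuf);
}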
*/ - up(&cpuset_sem); + cpuset_up(&cpuset_sem); err = cpuset_populate_dir(cs->dentry); /* If err < 0, we have a half-filled directory - oh well ;) */ return 0; err: list_del(&cs->sibling); - up(&cpuset_sem); + cpuset_up(&cpuset_sem); kfree(cs); return err; } @@ -1357,17 +1411,17 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) struct cpuset *cs = dentry->d_fsdata; struct dentry *d; struct cpuset *parent; + char *pathbuf = NULL; /* the vfs holds both inode->i_sem already */ - down(&cpuset_sem); - refresh_mems(); + cpuset_down(&cpuset_sem); if (atomic_read(&cs->count) > 0) { - up(&cpuset_sem); + cpuset_up(&cpuset_sem); return -EBUSY; } if (!list_empty(&cs->children)) { - up(&cpuset_sem); + cpuset_up(&cpuset_sem); return -EBUSY; } parent = cs->parent; @@ -1376,14 +1430,15 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) update_cpu_domains(cs); list_del(&cs->sibling); /* delete my sibling from parent->children */ if (list_empty(&parent->children)) - check_for_release(parent); + check_for_release(parent, &pathbuf); spin_lock(&cs->dentry->d_lock); d = dget(cs->dentry); cs->dentry = NULL; spin_unlock(&d->d_lock); cpuset_d_remove_dir(d); dput(d); - up(&cpuset_sem); + cpuset_up(&cpuset_sem); + cpuset_release_agent(pathbuf); return 0; } @@ -1440,10 +1495,10 @@ void __init cpuset_init_smp(void) /** * cpuset_fork - attach newly forked task to its parents cpuset. - * @p: pointer to task_struct of forking parent process. + * @tsk: pointer to task_struct of forking parent process. * * Description: By default, on fork, a task inherits its - * parents cpuset. The pointer to the shared cpuset is + * parent's cpuset. The pointer to the shared cpuset is * automatically copied in fork.c by dup_task_struct(). * This cpuset_fork() routine need only increment the usage * counter in that cpuset. @@ -1471,7 +1526,6 @@ void cpuset_fork(struct task_struct *tsk) * by the cpuset_sem semaphore. If you don't hold cpuset_sem, * then a zero cpuset use count is a license to any other task to * nuke the cpuset immediately. - * **/ void cpuset_exit(struct task_struct *tsk) @@ -1484,10 +1538,13 @@ void cpuset_exit(struct task_struct *tsk) task_unlock(tsk); if (notify_on_release(cs)) { - down(&cpuset_sem); + char *pathbuf = NULL; + + cpuset_down(&cpuset_sem); if (atomic_dec_and_test(&cs->count)) - check_for_release(cs); - up(&cpuset_sem); + check_for_release(cs, &pathbuf); + cpuset_up(&cpuset_sem); + cpuset_release_agent(pathbuf); } else { atomic_dec(&cs->count); } @@ -1507,11 +1564,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk) { cpumask_t mask; - down(&cpuset_sem); + cpuset_down(&cpuset_sem); task_lock((struct task_struct *)tsk); guarantee_online_cpus(tsk->cpuset, &mask); task_unlock((struct task_struct *)tsk); - up(&cpuset_sem); + cpuset_up(&cpuset_sem); return mask; } @@ -1521,7 +1578,9 @@ void cpuset_init_current_mems_allowed(void) current->mems_allowed = NODE_MASK_ALL; } -/* +/** + * cpuset_update_current_mems_allowed - update mems parameters to new values + * * If the current tasks cpusets mems_allowed changed behind our backs, * update current->mems_allowed and mems_generation to the new value. * Do not call this routine if in_interrupt(). 
@@ -1534,19 +1593,26 @@ void cpuset_update_current_mems_allowed(void) if (!cs) return; /* task is exiting */ if (current->cpuset_mems_generation != cs->mems_generation) { - down(&cpuset_sem); + cpuset_down(&cpuset_sem); refresh_mems(); - up(&cpuset_sem); + cpuset_up(&cpuset_sem); } } +/** + * cpuset_restrict_to_mems_allowed - limit nodes to current mems_allowed + * @nodes: pointer to a node bitmap that is and-ed with mems_allowed + */ void cpuset_restrict_to_mems_allowed(unsigned long *nodes) { bitmap_and(nodes, nodes, nodes_addr(current->mems_allowed), MAX_NUMNODES); } -/* +/** + * cpuset_zonelist_valid_mems_allowed - check zonelist vs. curremt mems_allowed + * @zl: the zonelist to be checked + * * Are any of the nodes on zonelist zl allowed in current->mems_allowed? */ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) @@ -1563,12 +1629,113 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) } /* - * Is 'current' valid, and is zone z allowed in current->mems_allowed? + * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive + * ancestor to the specified cpuset. Call while holding cpuset_sem. + * If no ancestor is mem_exclusive (an unusual configuration), then + * returns the root cpuset. */ -int cpuset_zone_allowed(struct zone *z) +static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) +{ + while (!is_mem_exclusive(cs) && cs->parent) + cs = cs->parent; + return cs; +} + +/** + * cpuset_zone_allowed - Can we allocate memory on zone z's memory node? + * @z: is this zone on an allowed node? + * @gfp_mask: memory allocation flags (we use __GFP_HARDWALL) + * + * If we're in interrupt, yes, we can always allocate. If zone + * z's node is in our tasks mems_allowed, yes. If it's not a + * __GFP_HARDWALL request and this zone's nodes is in the nearest + * mem_exclusive cpuset ancestor to this tasks cpuset, yes. + * Otherwise, no. + * + * GFP_USER allocations are marked with the __GFP_HARDWALL bit, + * and do not allow allocations outside the current tasks cpuset. + * GFP_KERNEL allocations are not so marked, so can escape to the + * nearest mem_exclusive ancestor cpuset. + * + * Scanning up parent cpusets requires cpuset_sem. The __alloc_pages() + * routine only calls here with __GFP_HARDWALL bit _not_ set if + * it's a GFP_KERNEL allocation, and all nodes in the current tasks + * mems_allowed came up empty on the first pass over the zonelist. + * So only GFP_KERNEL allocations, if all nodes in the cpuset are + * short of memory, might require taking the cpuset_sem semaphore. + * + * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() + * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing + * hardwall cpusets - no allocation on a node outside the cpuset is + * allowed (unless in interrupt, of course). + * + * The second loop doesn't even call here for GFP_ATOMIC requests + * (if the __alloc_pages() local variable 'wait' is set). That check + * and the checks below have the combined affect in the second loop of + * the __alloc_pages() routine that: + * in_interrupt - any node ok (current task context irrelevant) + * GFP_ATOMIC - any node ok + * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok + * GFP_USER - only nodes in current tasks mems allowed ok. + **/ + +int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask) +{ + int node; /* node that zone z is on */ + const struct cpuset *cs; /* current cpuset ancestors */ + int allowed = 1; /* is allocation in zone z allowed? 
*/ + + if (in_interrupt()) + return 1; + node = z->zone_pgdat->node_id; + if (node_isset(node, current->mems_allowed)) + return 1; + if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ + return 0; + + /* Not hardwall and node outside mems_allowed: scan up cpusets */ + cpuset_down(&cpuset_sem); + cs = current->cpuset; + if (!cs) + goto done; /* current task exiting */ + cs = nearest_exclusive_ancestor(cs); + allowed = node_isset(node, cs->mems_allowed); +done: + cpuset_up(&cpuset_sem); + return allowed; +} + +/** + * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors? + * @p: pointer to task_struct of some other task. + * + * Description: Return true if the nearest mem_exclusive ancestor + * cpusets of tasks @p and current overlap. Used by oom killer to + * determine if task @p's memory usage might impact the memory + * available to the current task. + * + * Acquires cpuset_sem - not suitable for calling from a fast path. + **/ + +int cpuset_excl_nodes_overlap(const struct task_struct *p) { - return in_interrupt() || - node_isset(z->zone_pgdat->node_id, current->mems_allowed); + const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ + int overlap = 0; /* do cpusets overlap? */ + + cpuset_down(&cpuset_sem); + cs1 = current->cpuset; + if (!cs1) + goto done; /* current task exiting */ + cs2 = p->cpuset; + if (!cs2) + goto done; /* task p is exiting */ + cs1 = nearest_exclusive_ancestor(cs1); + cs2 = nearest_exclusive_ancestor(cs2); + overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); +done: + cpuset_up(&cpuset_sem); + + return overlap; } /* @@ -1589,7 +1756,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v) return -ENOMEM; tsk = m->private; - down(&cpuset_sem); + cpuset_down(&cpuset_sem); task_lock(tsk); cs = tsk->cpuset; task_unlock(tsk); @@ -1604,7 +1771,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v) seq_puts(m, buf); seq_putc(m, '\n'); out: - up(&cpuset_sem); + cpuset_up(&cpuset_sem); kfree(buf); return retval; } diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c index 459ba49e376a..334c37f5218a 100644 --- a/kernel/crash_dump.c +++ b/kernel/crash_dump.c @@ -18,7 +18,16 @@ /* Stores the physical address of elf header of crash image. */ unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; -/* +/** + * copy_oldmem_page - copy one page from "oldmem" + * @pfn: page frame number to be copied + * @buf: target memory address for the copy; this can be in kernel address + * space or user address space (see @userbuf) + * @csize: number of bytes to copy + * @offset: offset in bytes into the page (based on pfn) to begin the copy + * @userbuf: if set, @buf is in user address space, use copy_to_user(), + * otherwise @buf is in kernel address space, use memcpy(). + * * Copy a page from "oldmem". For this page, there is no pte mapped * in the current kernel. We stitch up a pte, similar to kmap_atomic. 
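The cpuset_zone_allowed() comment above describes how __GFP_HARDWALL steers the page allocator: GFP_USER allocations stay inside the task's own cpuset, while GFP_KERNEL ones may fall back to the nearest mem_exclusive ancestor. A heavily simplified stand-in for the zonelist scan that consults it (not the real __alloc_pages() loop; try_this_zone() is a hypothetical helper):

static struct page *example_zonelist_scan(struct zonelist *zonelist, int order,
                                          unsigned int gfp_mask)
{
        struct page *page = NULL;
        int i;

        for (i = 0; zonelist->zones[i] != NULL; i++) {
                struct zone *z = zonelist->zones[i];

                /* The first pass forces __GFP_HARDWALL, confining the task
                 * to its own mems_allowed; a later pass without it lets
                 * GFP_KERNEL escape to the nearest mem_exclusive ancestor. */
                if (!cpuset_zone_allowed(z, gfp_mask | __GFP_HARDWALL))
                        continue;
                page = try_this_zone(z, order, gfp_mask);  /* hypothetical */
                if (page)
                        break;
        }
        return page;
}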
*/ diff --git a/kernel/exit.c b/kernel/exit.c index 9d1b10ed0135..6d2089a1bce7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -368,17 +368,19 @@ EXPORT_SYMBOL(daemonize); static inline void close_files(struct files_struct * files) { int i, j; + struct fdtable *fdt; j = 0; + fdt = files_fdtable(files); for (;;) { unsigned long set; i = j * __NFDBITS; - if (i >= files->max_fdset || i >= files->max_fds) + if (i >= fdt->max_fdset || i >= fdt->max_fds) break; - set = files->open_fds->fds_bits[j++]; + set = fdt->open_fds->fds_bits[j++]; while (set) { if (set & 1) { - struct file * file = xchg(&files->fd[i], NULL); + struct file * file = xchg(&fdt->fd[i], NULL); if (file) filp_close(file, files); } @@ -403,18 +405,22 @@ struct files_struct *get_files_struct(struct task_struct *task) void fastcall put_files_struct(struct files_struct *files) { + struct fdtable *fdt; + if (atomic_dec_and_test(&files->count)) { close_files(files); /* * Free the fd and fdset arrays if we expanded them. + * If the fdtable was embedded, pass files for freeing + * at the end of the RCU grace period. Otherwise, + * you can free files immediately. */ - if (files->fd != &files->fd_array[0]) - free_fd_array(files->fd, files->max_fds); - if (files->max_fdset > __FD_SETSIZE) { - free_fdset(files->open_fds, files->max_fdset); - free_fdset(files->close_on_exec, files->max_fdset); - } - kmem_cache_free(files_cachep, files); + fdt = files_fdtable(files); + if (fdt == &files->fdtab) + fdt->free_files = files; + else + kmem_cache_free(files_cachep, files); + free_fdtable(fdt); } } @@ -829,8 +835,10 @@ fastcall NORET_TYPE void do_exit(long code) acct_update_integrals(tsk); update_mem_hiwater(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); - if (group_dead) + if (group_dead) { + del_timer_sync(&tsk->signal->real_timer); acct_process(code); + } exit_mm(tsk); exit_sem(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index cdef6cea8900..8149f3602881 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -35,6 +35,7 @@ #include <linux/syscalls.h> #include <linux/jiffies.h> #include <linux/futex.h> +#include <linux/rcupdate.h> #include <linux/ptrace.h> #include <linux/mount.h> #include <linux/audit.h> @@ -176,6 +177,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) /* One for us, one for whoever does the "release_task()" (usually parent) */ atomic_set(&tsk->usage,2); + atomic_set(&tsk->fs_excl, 0); return tsk; } @@ -208,8 +210,10 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm) struct file *file; if (mpnt->vm_flags & VM_DONTCOPY) { + long pages = vma_pages(mpnt); + mm->total_vm -= pages; __vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, - -vma_pages(mpnt)); + -pages); continue; } charge = 0; @@ -562,24 +566,53 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk) return 0; } -static int count_open_files(struct files_struct *files, int size) +static int count_open_files(struct fdtable *fdt) { + int size = fdt->max_fdset; int i; /* Find the last open fd */ for (i = size/(8*sizeof(long)); i > 0; ) { - if (files->open_fds->fds_bits[--i]) + if (fdt->open_fds->fds_bits[--i]) break; } i = (i+1) * 8 * sizeof(long); return i; } +static struct files_struct *alloc_files(void) +{ + struct files_struct *newf; + struct fdtable *fdt; + + newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); + if (!newf) + goto out; + + atomic_set(&newf->count, 1); + + spin_lock_init(&newf->file_lock); + fdt = &newf->fdtab; + fdt->next_fd = 0; + fdt->max_fds = NR_OPEN_DEFAULT; + 
fdt->max_fdset = __FD_SETSIZE; + fdt->close_on_exec = &newf->close_on_exec_init; + fdt->open_fds = &newf->open_fds_init; + fdt->fd = &newf->fd_array[0]; + INIT_RCU_HEAD(&fdt->rcu); + fdt->free_files = NULL; + fdt->next = NULL; + rcu_assign_pointer(newf->fdt, fdt); +out: + return newf; +} + static int copy_files(unsigned long clone_flags, struct task_struct * tsk) { struct files_struct *oldf, *newf; struct file **old_fds, **new_fds; int open_files, size, i, error = 0, expand; + struct fdtable *old_fdt, *new_fdt; /* * A background process may not have any files ... @@ -600,35 +633,27 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) */ tsk->files = NULL; error = -ENOMEM; - newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); - if (!newf) + newf = alloc_files(); + if (!newf) goto out; - atomic_set(&newf->count, 1); - - spin_lock_init(&newf->file_lock); - newf->next_fd = 0; - newf->max_fds = NR_OPEN_DEFAULT; - newf->max_fdset = __FD_SETSIZE; - newf->close_on_exec = &newf->close_on_exec_init; - newf->open_fds = &newf->open_fds_init; - newf->fd = &newf->fd_array[0]; - spin_lock(&oldf->file_lock); - - open_files = count_open_files(oldf, oldf->max_fdset); + old_fdt = files_fdtable(oldf); + new_fdt = files_fdtable(newf); + size = old_fdt->max_fdset; + open_files = count_open_files(old_fdt); expand = 0; /* * Check whether we need to allocate a larger fd array or fd set. * Note: we're not a clone task, so the open count won't change. */ - if (open_files > newf->max_fdset) { - newf->max_fdset = 0; + if (open_files > new_fdt->max_fdset) { + new_fdt->max_fdset = 0; expand = 1; } - if (open_files > newf->max_fds) { - newf->max_fds = 0; + if (open_files > new_fdt->max_fds) { + new_fdt->max_fds = 0; expand = 1; } @@ -640,14 +665,21 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) spin_unlock(&newf->file_lock); if (error < 0) goto out_release; + new_fdt = files_fdtable(newf); + /* + * Reacquire the oldf lock and a pointer to its fd table + * who knows it may have a new bigger fd table. We need + * the latest pointer. + */ spin_lock(&oldf->file_lock); + old_fdt = files_fdtable(oldf); } - old_fds = oldf->fd; - new_fds = newf->fd; + old_fds = old_fdt->fd; + new_fds = new_fdt->fd; - memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8); - memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8); + memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8); + memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8); for (i = open_files; i != 0; i--) { struct file *f = *old_fds++; @@ -660,24 +692,24 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) * is partway through open(). So make sure that this * fd is available to the new process. 
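The exit.c/fork.c hunks above move the fd array and fd sets out of files_struct into an RCU-managed struct fdtable (note INIT_RCU_HEAD(), rcu_assign_pointer() on both files->fdt and the individual fd slots, and the free_files handoff in put_files_struct()). The intended payoff is lockless readers; a rough sketch of that reader-side pattern, assumed from the rest of the fdtable series rather than shown in this diff (example_lookup_fd() is an invented name):

static struct file *example_lookup_fd(struct files_struct *files, unsigned int fd)
{
        struct fdtable *fdt;
        struct file *file = NULL;

        rcu_read_lock();
        fdt = files_fdtable(files);             /* dereferences files->fdt */
        if (fd < fdt->max_fds)
                file = rcu_dereference(fdt->fd[fd]);
        rcu_read_unlock();

        /* The caller still needs its own reference (e.g. get_file())
         * before using the struct file outside the RCU read section. */
        return file;
}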
*/ - FD_CLR(open_files - i, newf->open_fds); + FD_CLR(open_files - i, new_fdt->open_fds); } - *new_fds++ = f; + rcu_assign_pointer(*new_fds++, f); } spin_unlock(&oldf->file_lock); /* compute the remainder to be cleared */ - size = (newf->max_fds - open_files) * sizeof(struct file *); + size = (new_fdt->max_fds - open_files) * sizeof(struct file *); /* This is long word aligned thus could use a optimized version */ memset(new_fds, 0, size); - if (newf->max_fdset > open_files) { - int left = (newf->max_fdset-open_files)/8; + if (new_fdt->max_fdset > open_files) { + int left = (new_fdt->max_fdset-open_files)/8; int start = open_files / (8 * sizeof(unsigned long)); - memset(&newf->open_fds->fds_bits[start], 0, left); - memset(&newf->close_on_exec->fds_bits[start], 0, left); + memset(&new_fdt->open_fds->fds_bits[start], 0, left); + memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); } tsk->files = newf; @@ -686,9 +718,9 @@ out: return error; out_release: - free_fdset (newf->close_on_exec, newf->max_fdset); - free_fdset (newf->open_fds, newf->max_fdset); - free_fd_array(newf->fd, newf->max_fds); + free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset); + free_fdset (new_fdt->open_fds, new_fdt->max_fdset); + free_fd_array(new_fdt->fd, new_fdt->max_fds); kmem_cache_free(files_cachep, newf); goto out; } @@ -992,6 +1024,9 @@ static task_t *copy_process(unsigned long clone_flags, * of CLONE_PTRACE. */ clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); +#ifdef TIF_SYSCALL_EMU + clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); +#endif /* Our parent execution domain becomes current domain These must match for thread signalling to apply */ @@ -1110,6 +1145,9 @@ static task_t *copy_process(unsigned long clone_flags, __get_cpu_var(process_counts)++; } + if (!current->signal->tty && p->signal->tty) + p->signal->tty = NULL; + nr_threads++; total_forks++; write_unlock_irq(&tasklist_lock); diff --git a/kernel/futex.c b/kernel/futex.c index c7130f86106c..ca05fe6a70b2 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -40,6 +40,7 @@ #include <linux/pagemap.h> #include <linux/syscalls.h> #include <linux/signal.h> +#include <asm/futex.h> #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) @@ -327,6 +328,118 @@ out: } /* + * Wake up all waiters hashed on the physical page that is mapped + * to this virtual address: + */ +static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op) +{ + union futex_key key1, key2; + struct futex_hash_bucket *bh1, *bh2; + struct list_head *head; + struct futex_q *this, *next; + int ret, op_ret, attempt = 0; + +retryfull: + down_read(¤t->mm->mmap_sem); + + ret = get_futex_key(uaddr1, &key1); + if (unlikely(ret != 0)) + goto out; + ret = get_futex_key(uaddr2, &key2); + if (unlikely(ret != 0)) + goto out; + + bh1 = hash_futex(&key1); + bh2 = hash_futex(&key2); + +retry: + if (bh1 < bh2) + spin_lock(&bh1->lock); + spin_lock(&bh2->lock); + if (bh1 > bh2) + spin_lock(&bh1->lock); + + op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2); + if (unlikely(op_ret < 0)) { + int dummy; + + spin_unlock(&bh1->lock); + if (bh1 != bh2) + spin_unlock(&bh2->lock); + + /* futex_atomic_op_inuser needs to both read and write + * *(int __user *)uaddr2, but we can't modify it + * non-atomically. Therefore, if get_user below is not + * enough, we need to handle the fault ourselves, while + * still holding the mmap_sem. 
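The bh1/bh2 locking in futex_wake_op() above always takes the lower-addressed hash-bucket lock first, and takes only one lock when both keys hash to the same bucket, so two concurrent callers working on the same pair of buckets cannot deadlock. The idiom in isolation, as a sketch with a hypothetical helper name:

static void double_lock_hb(spinlock_t *a, spinlock_t *b)
{
	/* acquire in address order; a single lock if a == b */
	if (a < b)
		spin_lock(a);
	spin_lock(b);
	if (a > b)
		spin_lock(a);
}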
*/ + if (attempt++) { + struct vm_area_struct * vma; + struct mm_struct *mm = current->mm; + + ret = -EFAULT; + if (attempt >= 2 || + !(vma = find_vma(mm, uaddr2)) || + vma->vm_start > uaddr2 || + !(vma->vm_flags & VM_WRITE)) + goto out; + + switch (handle_mm_fault(mm, vma, uaddr2, 1)) { + case VM_FAULT_MINOR: + current->min_flt++; + break; + case VM_FAULT_MAJOR: + current->maj_flt++; + break; + default: + goto out; + } + goto retry; + } + + /* If we would have faulted, release mmap_sem, + * fault it in and start all over again. */ + up_read(¤t->mm->mmap_sem); + + ret = get_user(dummy, (int __user *)uaddr2); + if (ret) + return ret; + + goto retryfull; + } + + head = &bh1->chain; + + list_for_each_entry_safe(this, next, head, list) { + if (match_futex (&this->key, &key1)) { + wake_futex(this); + if (++ret >= nr_wake) + break; + } + } + + if (op_ret > 0) { + head = &bh2->chain; + + op_ret = 0; + list_for_each_entry_safe(this, next, head, list) { + if (match_futex (&this->key, &key2)) { + wake_futex(this); + if (++op_ret >= nr_wake2) + break; + } + } + ret += op_ret; + } + + spin_unlock(&bh1->lock); + if (bh1 != bh2) + spin_unlock(&bh2->lock); +out: + up_read(¤t->mm->mmap_sem); + return ret; +} + +/* * Requeue all waiters hashed on one physical page to another * physical page. */ @@ -673,23 +786,17 @@ static int futex_fd(unsigned long uaddr, int signal) filp->f_mapping = filp->f_dentry->d_inode->i_mapping; if (signal) { - int err; err = f_setown(filp, current->pid, 1); if (err < 0) { - put_unused_fd(ret); - put_filp(filp); - ret = err; - goto out; + goto error; } filp->f_owner.signum = signal; } q = kmalloc(sizeof(*q), GFP_KERNEL); if (!q) { - put_unused_fd(ret); - put_filp(filp); - ret = -ENOMEM; - goto out; + err = -ENOMEM; + goto error; } down_read(¤t->mm->mmap_sem); @@ -697,10 +804,8 @@ static int futex_fd(unsigned long uaddr, int signal) if (unlikely(err != 0)) { up_read(¤t->mm->mmap_sem); - put_unused_fd(ret); - put_filp(filp); kfree(q); - return err; + goto error; } /* @@ -716,6 +821,11 @@ static int futex_fd(unsigned long uaddr, int signal) fd_install(ret, filp); out: return ret; +error: + put_unused_fd(ret); + put_filp(filp); + ret = err; + goto out; } long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, @@ -740,6 +850,9 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, case FUTEX_CMP_REQUEUE: ret = futex_requeue(uaddr, uaddr2, val, val2, &val3); break; + case FUTEX_WAKE_OP: + ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); + break; default: ret = -ENOSYS; } diff --git a/kernel/intermodule.c b/kernel/intermodule.c index 388977f3e9b7..0cbe633420fb 100644 --- a/kernel/intermodule.c +++ b/kernel/intermodule.c @@ -39,7 +39,7 @@ void inter_module_register(const char *im_name, struct module *owner, const void struct list_head *tmp; struct inter_module_entry *ime, *ime_new; - if (!(ime_new = kmalloc(sizeof(*ime), GFP_KERNEL))) { + if (!(ime_new = kzalloc(sizeof(*ime), GFP_KERNEL))) { /* Overloaded kernel, not fatal */ printk(KERN_ERR "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n", @@ -47,7 +47,6 @@ void inter_module_register(const char *im_name, struct module *owner, const void kmalloc_failed = 1; return; } - memset(ime_new, 0, sizeof(*ime_new)); ime_new->im_name = im_name; ime_new->owner = owner; ime_new->userdata = userdata; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index c29f83c16497..3ff7b925c387 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -111,7 +111,7 @@ fastcall unsigned int 
__do_IRQ(unsigned int irq, struct pt_regs *regs) unsigned int status; kstat_this_cpu.irqs[irq]++; - if (desc->status & IRQ_PER_CPU) { + if (CHECK_IRQ_PER_CPU(desc->status)) { irqreturn_t action_ret; /* diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ac6700985705..1cfdb08ddf20 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -18,6 +18,10 @@ cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL }; +#if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE) +cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS]; +#endif + /** * synchronize_irq - wait for pending IRQ handlers (on other CPUs) * diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 85d08daa6600..f26e534c6585 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -19,12 +19,22 @@ static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; */ static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; -void __attribute__((weak)) -proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) +#ifdef CONFIG_GENERIC_PENDING_IRQ +void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) +{ + /* + * Save these away for later use. Re-progam when the + * interrupt is pending + */ + set_pending_irq(irq, mask_val); +} +#else +void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) { irq_affinity[irq] = mask_val; irq_desc[irq].handler->set_affinity(irq, mask_val); } +#endif static int irq_affinity_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) diff --git a/kernel/itimer.c b/kernel/itimer.c index a72cb0e5aa4b..7c1b25e25e47 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -112,28 +112,11 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value) return error; } -/* - * Called with P->sighand->siglock held and P->signal->real_timer inactive. - * If interval is nonzero, arm the timer for interval ticks from now. - */ -static inline void it_real_arm(struct task_struct *p, unsigned long interval) -{ - p->signal->it_real_value = interval; /* XXX unnecessary field?? */ - if (interval == 0) - return; - if (interval > (unsigned long) LONG_MAX) - interval = LONG_MAX; - /* the "+ 1" below makes sure that the timer doesn't go off before - * the interval requested. This could happen if - * time requested % (usecs per jiffy) is more than the usecs left - * in the current jiffy */ - p->signal->real_timer.expires = jiffies + interval + 1; - add_timer(&p->signal->real_timer); -} void it_real_fn(unsigned long __data) { struct task_struct * p = (struct task_struct *) __data; + unsigned long inc = p->signal->it_real_incr; send_group_sig_info(SIGALRM, SEND_SIG_PRIV, p); @@ -141,14 +124,23 @@ void it_real_fn(unsigned long __data) * Now restart the timer if necessary. We don't need any locking * here because do_setitimer makes sure we have finished running * before it touches anything. + * Note, we KNOW we are (or should be) at a jiffie edge here so + * we don't need the +1 stuff. Also, we want to use the prior + * expire value so as to not "slip" a jiffie if we are late. + * Deal with requesting a time prior to "now" here rather than + * in add_timer. 
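A concrete instance of the phase-preserving re-arm described above: with it_real_incr equal to 10 ticks and real_timer.expires already 3 ticks in the past, the catch-up loop just below advances expires by one interval, to 7 ticks from now, so the timer stays on its original 10-tick phase instead of being re-armed a full interval from "now".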
*/ - it_real_arm(p, p->signal->it_real_incr); + if (!inc) + return; + while (time_before_eq(p->signal->real_timer.expires, jiffies)) + p->signal->real_timer.expires += inc; + add_timer(&p->signal->real_timer); } int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) { struct task_struct *tsk = current; - unsigned long val, interval; + unsigned long val, interval, expires; cputime_t cval, cinterval, nval, ninterval; switch (which) { @@ -164,7 +156,10 @@ again: } tsk->signal->it_real_incr = timeval_to_jiffies(&value->it_interval); - it_real_arm(tsk, timeval_to_jiffies(&value->it_value)); + expires = timeval_to_jiffies(&value->it_value); + if (expires) + mod_timer(&tsk->signal->real_timer, + jiffies + 1 + expires); spin_unlock_irq(&tsk->sighand->siglock); if (ovalue) { jiffies_to_timeval(val, &ovalue->it_value); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index b0237122b24e..f3ea492ab44d 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -37,6 +37,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/moduleloader.h> +#include <asm-generic/sections.h> #include <asm/cacheflush.h> #include <asm/errno.h> #include <asm/kdebug.h> @@ -72,7 +73,7 @@ static struct hlist_head kprobe_insn_pages; * get_insn_slot() - Find a slot on an executable page for an instruction. * We allocate an executable page if there's no room on existing ones. */ -kprobe_opcode_t *get_insn_slot(void) +kprobe_opcode_t __kprobes *get_insn_slot(void) { struct kprobe_insn_page *kip; struct hlist_node *pos; @@ -117,7 +118,7 @@ kprobe_opcode_t *get_insn_slot(void) return kip->insns; } -void free_insn_slot(kprobe_opcode_t *slot) +void __kprobes free_insn_slot(kprobe_opcode_t *slot) { struct kprobe_insn_page *kip; struct hlist_node *pos; @@ -152,20 +153,42 @@ void free_insn_slot(kprobe_opcode_t *slot) } /* Locks kprobe: irqs must be disabled */ -void lock_kprobes(void) +void __kprobes lock_kprobes(void) { + unsigned long flags = 0; + + /* Avoiding local interrupts to happen right after we take the kprobe_lock + * and before we get a chance to update kprobe_cpu, this to prevent + * deadlock when we have a kprobe on ISR routine and a kprobe on task + * routine + */ + local_irq_save(flags); + spin_lock(&kprobe_lock); kprobe_cpu = smp_processor_id(); + + local_irq_restore(flags); } -void unlock_kprobes(void) +void __kprobes unlock_kprobes(void) { + unsigned long flags = 0; + + /* Avoiding local interrupts to happen right after we update + * kprobe_cpu and before we get a a chance to release kprobe_lock, + * this to prevent deadlock when we have a kprobe on ISR routine and + * a kprobe on task routine + */ + local_irq_save(flags); + kprobe_cpu = NR_CPUS; spin_unlock(&kprobe_lock); + + local_irq_restore(flags); } /* You have to be holding the kprobe_lock */ -struct kprobe *get_kprobe(void *addr) +struct kprobe __kprobes *get_kprobe(void *addr) { struct hlist_head *head; struct hlist_node *node; @@ -183,7 +206,7 @@ struct kprobe *get_kprobe(void *addr) * Aggregate handlers for multiple kprobes support - these handlers * take care of invoking the individual kprobe handlers on p->list */ -static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) +static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) { struct kprobe *kp; @@ -198,8 +221,8 @@ static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) return 0; } -static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, - unsigned long flags) +static void __kprobes aggr_post_handler(struct 
kprobe *p, struct pt_regs *regs, + unsigned long flags) { struct kprobe *kp; @@ -213,8 +236,8 @@ static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, return; } -static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, - int trapnr) +static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, + int trapnr) { /* * if we faulted "during" the execution of a user specified @@ -227,7 +250,7 @@ static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, return 0; } -static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs) +static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) { struct kprobe *kp = curr_kprobe; if (curr_kprobe && kp->break_handler) { @@ -240,7 +263,7 @@ static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs) return 0; } -struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp) +struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp) { struct hlist_node *node; struct kretprobe_instance *ri; @@ -249,7 +272,8 @@ struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp) return NULL; } -static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp) +static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe + *rp) { struct hlist_node *node; struct kretprobe_instance *ri; @@ -258,7 +282,7 @@ static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp) return NULL; } -void add_rp_inst(struct kretprobe_instance *ri) +void __kprobes add_rp_inst(struct kretprobe_instance *ri) { /* * Remove rp inst off the free list - @@ -276,7 +300,7 @@ void add_rp_inst(struct kretprobe_instance *ri) hlist_add_head(&ri->uflist, &ri->rp->used_instances); } -void recycle_rp_inst(struct kretprobe_instance *ri) +void __kprobes recycle_rp_inst(struct kretprobe_instance *ri) { /* remove rp inst off the rprobe_inst_table */ hlist_del(&ri->hlist); @@ -291,7 +315,7 @@ void recycle_rp_inst(struct kretprobe_instance *ri) kfree(ri); } -struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk) +struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk) { return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]; } @@ -302,7 +326,7 @@ struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk) * instances associated with this task. These left over instances represent * probed functions that have been called but will never return. */ -void kprobe_flush_task(struct task_struct *tk) +void __kprobes kprobe_flush_task(struct task_struct *tk) { struct kretprobe_instance *ri; struct hlist_head *head; @@ -322,7 +346,8 @@ void kprobe_flush_task(struct task_struct *tk) * This kprobe pre_handler is registered with every kretprobe. When probe * hits it will set up the return probe. */ -static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) +static int __kprobes pre_handler_kretprobe(struct kprobe *p, + struct pt_regs *regs) { struct kretprobe *rp = container_of(p, struct kretprobe, kp); @@ -353,7 +378,7 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) * Add the new probe to old_p->list. 
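The __kprobes annotations being added throughout this file move the kprobes machinery into its own text section so that kprobes can refuse to place probes on itself (enforced by in_kprobes_functions() further below). The marker is conventionally defined along these lines (sketch; the authoritative definition lives in the kprobes headers):

/* collected between __kprobes_text_start/__kprobes_text_end by the linker script */
#define __kprobes	__attribute__((__section__(".kprobes.text")))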
Fail if this is the * second jprobe at the address - two jprobes can't coexist */ -static int add_new_kprobe(struct kprobe *old_p, struct kprobe *p) +static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) { struct kprobe *kp; @@ -395,7 +420,8 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) * the intricacies * TODO: Move kcalloc outside the spinlock */ -static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p) +static int __kprobes register_aggr_kprobe(struct kprobe *old_p, + struct kprobe *p) { int ret = 0; struct kprobe *ap; @@ -434,15 +460,25 @@ static inline void cleanup_aggr_kprobe(struct kprobe *old_p, spin_unlock_irqrestore(&kprobe_lock, flags); } -int register_kprobe(struct kprobe *p) +static int __kprobes in_kprobes_functions(unsigned long addr) +{ + if (addr >= (unsigned long)__kprobes_text_start + && addr < (unsigned long)__kprobes_text_end) + return -EINVAL; + return 0; +} + +int __kprobes register_kprobe(struct kprobe *p) { int ret = 0; unsigned long flags = 0; struct kprobe *old_p; - if ((ret = arch_prepare_kprobe(p)) != 0) { + if ((ret = in_kprobes_functions((unsigned long) p->addr)) != 0) + return ret; + if ((ret = arch_prepare_kprobe(p)) != 0) goto rm_kprobe; - } + spin_lock_irqsave(&kprobe_lock, flags); old_p = get_kprobe(p->addr); p->nmissed = 0; @@ -466,7 +502,7 @@ rm_kprobe: return ret; } -void unregister_kprobe(struct kprobe *p) +void __kprobes unregister_kprobe(struct kprobe *p) { unsigned long flags; struct kprobe *old_p; @@ -487,7 +523,7 @@ static struct notifier_block kprobe_exceptions_nb = { .priority = 0x7fffffff /* we need to notified first */ }; -int register_jprobe(struct jprobe *jp) +int __kprobes register_jprobe(struct jprobe *jp) { /* Todo: Verify probepoint is a function entry point */ jp->kp.pre_handler = setjmp_pre_handler; @@ -496,14 +532,14 @@ int register_jprobe(struct jprobe *jp) return register_kprobe(&jp->kp); } -void unregister_jprobe(struct jprobe *jp) +void __kprobes unregister_jprobe(struct jprobe *jp) { unregister_kprobe(&jp->kp); } #ifdef ARCH_SUPPORTS_KRETPROBES -int register_kretprobe(struct kretprobe *rp) +int __kprobes register_kretprobe(struct kretprobe *rp) { int ret = 0; struct kretprobe_instance *inst; @@ -540,14 +576,14 @@ int register_kretprobe(struct kretprobe *rp) #else /* ARCH_SUPPORTS_KRETPROBES */ -int register_kretprobe(struct kretprobe *rp) +int __kprobes register_kretprobe(struct kretprobe *rp) { return -ENOSYS; } #endif /* ARCH_SUPPORTS_KRETPROBES */ -void unregister_kretprobe(struct kretprobe *rp) +void __kprobes unregister_kretprobe(struct kretprobe *rp) { unsigned long flags; struct kretprobe_instance *ri; diff --git a/kernel/module.c b/kernel/module.c index 068e271ab3a5..ff5c500ab625 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -20,6 +20,7 @@ #include <linux/module.h> #include <linux/moduleloader.h> #include <linux/init.h> +#include <linux/kernel.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/elf.h> @@ -250,13 +251,18 @@ static inline unsigned int block_size(int val) /* Created by linker magic */ extern char __per_cpu_start[], __per_cpu_end[]; -static void *percpu_modalloc(unsigned long size, unsigned long align) +static void *percpu_modalloc(unsigned long size, unsigned long align, + const char *name) { unsigned long extra; unsigned int i; void *ptr; - BUG_ON(align > SMP_CACHE_BYTES); + if (align > SMP_CACHE_BYTES) { + printk(KERN_WARNING "%s: per-cpu alignment %li > %i\n", + name, align, SMP_CACHE_BYTES); + align = 
SMP_CACHE_BYTES; + } ptr = __per_cpu_start; for (i = 0; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) { @@ -348,7 +354,8 @@ static int percpu_modinit(void) } __initcall(percpu_modinit); #else /* ... !CONFIG_SMP */ -static inline void *percpu_modalloc(unsigned long size, unsigned long align) +static inline void *percpu_modalloc(unsigned long size, unsigned long align, + const char *name) { return NULL; } @@ -492,7 +499,7 @@ static inline int try_force(unsigned int flags) { int ret = (flags & O_TRUNC); if (ret) - tainted |= TAINT_FORCED_MODULE; + add_taint(TAINT_FORCED_MODULE); return ret; } #else @@ -891,7 +898,7 @@ static int check_version(Elf_Shdr *sechdrs, if (!(tainted & TAINT_FORCED_MODULE)) { printk("%s: no version for \"%s\" found: kernel tainted.\n", mod->name, symname); - tainted |= TAINT_FORCED_MODULE; + add_taint(TAINT_FORCED_MODULE); } return 1; } @@ -1346,7 +1353,7 @@ static void set_license(struct module *mod, const char *license) if (!mod->license_gplok && !(tainted & TAINT_PROPRIETARY_MODULE)) { printk(KERN_WARNING "%s: module license '%s' taints kernel.\n", mod->name, license); - tainted |= TAINT_PROPRIETARY_MODULE; + add_taint(TAINT_PROPRIETARY_MODULE); } } @@ -1503,6 +1510,7 @@ static struct module *load_module(void __user *umod, long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ struct exception_table_entry *extable; + mm_segment_t old_fs; DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", umod, len, uargs); @@ -1603,7 +1611,7 @@ static struct module *load_module(void __user *umod, modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); /* This is allowed: modprobe --force will invalidate it. */ if (!modmagic) { - tainted |= TAINT_FORCED_MODULE; + add_taint(TAINT_FORCED_MODULE); printk(KERN_WARNING "%s: no version magic, tainting kernel.\n", mod->name); } else if (!same_magic(modmagic, vermagic)) { @@ -1644,7 +1652,8 @@ static struct module *load_module(void __user *umod, if (pcpuindex) { /* We have a special allocation for this section. */ percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size, - sechdrs[pcpuindex].sh_addralign); + sechdrs[pcpuindex].sh_addralign, + mod->name); if (!percpu) { err = -ENOMEM; goto free_mod; @@ -1731,7 +1740,7 @@ static struct module *load_module(void __user *umod, (mod->num_gpl_syms && !gplcrcindex)) { printk(KERN_WARNING "%s: No versions for exported symbols." " Tainting kernel.\n", mod->name); - tainted |= TAINT_FORCED_MODULE; + add_taint(TAINT_FORCED_MODULE); } #endif @@ -1772,6 +1781,24 @@ static struct module *load_module(void __user *umod, if (err < 0) goto cleanup; + /* flush the icache in correct context */ + old_fs = get_fs(); + set_fs(KERNEL_DS); + + /* + * Flush the instruction cache, since we've played with text. + * Do it before processing of module parameters, so the module + * can provide parameter accessor functions of its own. 
+ */ + if (mod->module_init) + flush_icache_range((unsigned long)mod->module_init, + (unsigned long)mod->module_init + + mod->init_size); + flush_icache_range((unsigned long)mod->module_core, + (unsigned long)mod->module_core + mod->core_size); + + set_fs(old_fs); + mod->args = args; if (obsparmindex) { err = obsolete_params(mod->name, mod->args, @@ -1853,7 +1880,6 @@ sys_init_module(void __user *umod, const char __user *uargs) { struct module *mod; - mm_segment_t old_fs = get_fs(); int ret = 0; /* Must have permission */ @@ -1871,19 +1897,6 @@ sys_init_module(void __user *umod, return PTR_ERR(mod); } - /* flush the icache in correct context */ - set_fs(KERNEL_DS); - - /* Flush the instruction cache, since we've played with text */ - if (mod->module_init) - flush_icache_range((unsigned long)mod->module_init, - (unsigned long)mod->module_init - + mod->init_size); - flush_icache_range((unsigned long)mod->module_core, - (unsigned long)mod->module_core + mod->core_size); - - set_fs(old_fs); - /* Now sew it into the lists. They won't access us, since strong_try_module_get() will fail. */ stop_machine_run(__link_module, mod, NR_CPUS); diff --git a/kernel/panic.c b/kernel/panic.c index 74ba5f3e46c7..aabc5f86fa3f 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -111,12 +111,11 @@ NORET_TYPE void panic(const char * fmt, ...) mdelay(1); i++; } - /* - * Should we run the reboot notifier. For the moment Im - * choosing not too. It might crash, be corrupt or do - * more harm than good for other reasons. + /* This will not be a clean reboot, with everything + * shutting down. But if there is a chance of + * rebooting the system it will be rebooted. */ - machine_restart(NULL); + emergency_restart(); } #ifdef __sparc__ { diff --git a/kernel/params.c b/kernel/params.c index d586c35ef8fc..fbf173215fd2 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -542,8 +542,8 @@ static void __init kernel_param_sysfs_setup(const char *name, { struct module_kobject *mk; - mk = kmalloc(sizeof(struct module_kobject), GFP_KERNEL); - memset(mk, 0, sizeof(struct module_kobject)); + mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); + BUG_ON(!mk); mk->mod = THIS_MODULE; kobj_set_kset_s(mk, module_subsys); diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 5b7b4736d82b..b7b532acd9fc 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -427,21 +427,23 @@ int posix_timer_event(struct k_itimer *timr,int si_private) timr->sigq->info.si_code = SI_TIMER; timr->sigq->info.si_tid = timr->it_id; timr->sigq->info.si_value = timr->it_sigev_value; + if (timr->it_sigev_notify & SIGEV_THREAD_ID) { - if (unlikely(timr->it_process->flags & PF_EXITING)) { - timr->it_sigev_notify = SIGEV_SIGNAL; - put_task_struct(timr->it_process); - timr->it_process = timr->it_process->group_leader; - goto group; - } - return send_sigqueue(timr->it_sigev_signo, timr->sigq, - timr->it_process); - } - else { - group: - return send_group_sigqueue(timr->it_sigev_signo, timr->sigq, - timr->it_process); + struct task_struct *leader; + int ret = send_sigqueue(timr->it_sigev_signo, timr->sigq, + timr->it_process); + + if (likely(ret >= 0)) + return ret; + + timr->it_sigev_notify = SIGEV_SIGNAL; + leader = timr->it_process->group_leader; + put_task_struct(timr->it_process); + timr->it_process = leader; } + + return send_group_sigqueue(timr->it_sigev_signo, timr->sigq, + timr->it_process); } EXPORT_SYMBOL_GPL(posix_timer_event); @@ -896,21 +898,10 @@ static int adjust_abs_time(struct k_clock *clock, struct timespec *tp, 
jiffies_64_f = get_jiffies_64(); } /* - * Take away now to get delta - */ - oc.tv_sec -= now.tv_sec; - oc.tv_nsec -= now.tv_nsec; - /* - * Normalize... + * Take away now to get delta and normalize */ - while ((oc.tv_nsec - NSEC_PER_SEC) >= 0) { - oc.tv_nsec -= NSEC_PER_SEC; - oc.tv_sec++; - } - while ((oc.tv_nsec) < 0) { - oc.tv_nsec += NSEC_PER_SEC; - oc.tv_sec--; - } + set_normalized_timespec(&oc, oc.tv_sec - now.tv_sec, + oc.tv_nsec - now.tv_nsec); }else{ jiffies_64_f = get_jiffies_64(); } @@ -1177,7 +1168,6 @@ void exit_itimers(struct signal_struct *sig) tmr = list_entry(sig->posix_timers.next, struct k_itimer, list); itimer_delete(tmr); } - del_timer_sync(&sig->real_timer); } /* diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 2c7121d9bff1..396c7873e804 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -1,5 +1,6 @@ config PM bool "Power Management support" + depends on !IA64_HP_SIM ---help--- "Power Management" means that parts of your computer are shut off or put into a power conserving "sleep" mode if they are not @@ -28,7 +29,7 @@ config PM_DEBUG config SOFTWARE_SUSPEND bool "Software Suspend" - depends on EXPERIMENTAL && PM && SWAP && ((X86 && SMP) || ((FVR || PPC32 || X86) && !SMP)) + depends on PM && SWAP && (X86 || ((FVR || PPC32) && !SMP)) ---help--- Enable the possibility of suspending the machine. It doesn't need APM. @@ -72,6 +73,18 @@ config PM_STD_PARTITION suspended image to. It will simply pick the first available swap device. +config SWSUSP_ENCRYPT + bool "Encrypt suspend image" + depends on SOFTWARE_SUSPEND && CRYPTO=y && (CRYPTO_AES=y || CRYPTO_AES_586=y || CRYPTO_AES_X86_64=y) + default "" + ---help--- + To prevent data gathering from swap after resume you can encrypt + the suspend image with a temporary key that is deleted on + resume. + + Note that the temporary key is stored unencrypted on disk while the + system is suspended. + config SUSPEND_SMP bool depends on HOTPLUG_CPU && X86 && PM diff --git a/kernel/power/disk.c b/kernel/power/disk.c index fb8de63c2919..2d8bf054d036 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -16,6 +16,8 @@ #include <linux/device.h> #include <linux/delay.h> #include <linux/fs.h> +#include <linux/mount.h> + #include "power.h" @@ -57,16 +59,13 @@ static void power_down(suspend_disk_method_t mode) error = pm_ops->enter(PM_SUSPEND_DISK); break; case PM_DISK_SHUTDOWN: - printk("Powering off system\n"); - device_shutdown(); - machine_power_off(); + kernel_power_off(); break; case PM_DISK_REBOOT: - device_shutdown(); - machine_restart(NULL); + kernel_restart(NULL); break; } - machine_halt(); + kernel_halt(); /* Valid image is on the disk, if we continue we risk serious data corruption after resume. */ printk(KERN_CRIT "Please power me down manually\n"); @@ -113,24 +112,12 @@ static inline void platform_finish(void) } } -static void finish(void) -{ - device_resume(); - platform_finish(); - thaw_processes(); - enable_nonboot_cpus(); - pm_restore_console(); -} - - static int prepare_processes(void) { int error; pm_prepare_console(); - sys_sync(); - disable_nonboot_cpus(); if (freeze_processes()) { @@ -163,15 +150,6 @@ static void unprepare_processes(void) pm_restore_console(); } -static int prepare_devices(void) -{ - int error; - - if ((error = device_suspend(PMSG_FREEZE))) - printk("Some devices failed to suspend\n"); - return error; -} - /** * pm_suspend_disk - The granpappy of power management. 
* @@ -188,17 +166,14 @@ int pm_suspend_disk(void) error = prepare_processes(); if (error) return error; - error = prepare_devices(); + error = device_suspend(PMSG_FREEZE); if (error) { + printk("Some devices failed to suspend\n"); unprepare_processes(); return error; } - pr_debug("PM: Attempting to suspend to disk.\n"); - if (pm_disk_mode == PM_DISK_FIRMWARE) - return pm_ops->enter(PM_SUSPEND_DISK); - pr_debug("PM: snapshotting memory.\n"); in_suspend = 1; if ((error = swsusp_suspend())) @@ -209,11 +184,20 @@ int pm_suspend_disk(void) error = swsusp_write(); if (!error) power_down(pm_disk_mode); + else { + /* swsusp_write can not fail in device_resume, + no need to do second device_resume */ + swsusp_free(); + unprepare_processes(); + return error; + } } else pr_debug("PM: Image restored successfully.\n"); + swsusp_free(); Done: - finish(); + device_resume(); + unprepare_processes(); return error; } @@ -234,11 +218,25 @@ static int software_resume(void) { int error; + down(&pm_sem); + if (!swsusp_resume_device) { + if (!strlen(resume_file)) { + up(&pm_sem); + return -ENOENT; + } + swsusp_resume_device = name_to_dev_t(resume_file); + pr_debug("swsusp: Resume From Partition %s\n", resume_file); + } else { + pr_debug("swsusp: Resume From Partition %d:%d\n", + MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); + } + if (noresume) { /** * FIXME: If noresume is specified, we need to find the partition * and reset it back to normal swap space. */ + up(&pm_sem); return 0; } @@ -261,20 +259,24 @@ static int software_resume(void) pr_debug("PM: Preparing devices for restore.\n"); - if ((error = prepare_devices())) + if ((error = device_suspend(PMSG_FREEZE))) { + printk("Some devices failed to suspend\n"); goto Free; + } mb(); pr_debug("PM: Restoring saved image.\n"); swsusp_resume(); pr_debug("PM: Restore failed, recovering.n"); - finish(); + device_resume(); Free: swsusp_free(); Cleanup: unprepare_processes(); Done: + /* For success case, the suspend path will release the lock */ + up(&pm_sem); pr_debug("PM: Resume from disk failed.\n"); return 0; } @@ -381,7 +383,9 @@ static ssize_t resume_store(struct subsystem * subsys, const char * buf, size_t if (sscanf(buf, "%u:%u", &maj, &min) == 2) { res = MKDEV(maj,min); if (maj == MAJOR(res) && min == MINOR(res)) { + down(&pm_sem); swsusp_resume_device = res; + up(&pm_sem); printk("Attempting manual resume\n"); noresume = 0; software_resume(); diff --git a/kernel/power/main.c b/kernel/power/main.c index c94cb9e95090..22bdc93cc038 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -19,6 +19,9 @@ #include "power.h" +/*This is just an arbitrary number */ +#define FREE_PAGE_NUMBER (100) + DECLARE_MUTEX(pm_sem); struct pm_ops * pm_ops = NULL; @@ -49,6 +52,7 @@ void pm_set_ops(struct pm_ops * ops) static int suspend_prepare(suspend_state_t state) { int error = 0; + unsigned int free_pages; if (!pm_ops || !pm_ops->enter) return -EPERM; @@ -67,6 +71,16 @@ static int suspend_prepare(suspend_state_t state) goto Thaw; } + if ((free_pages = nr_free_pages()) < FREE_PAGE_NUMBER) { + pr_debug("PM: free some memory\n"); + shrink_all_memory(FREE_PAGE_NUMBER - free_pages); + if (nr_free_pages() < FREE_PAGE_NUMBER) { + error = -ENOMEM; + printk(KERN_ERR "PM: No enough memory\n"); + goto Thaw; + } + } + if (pm_ops->prepare) { if ((error = pm_ops->prepare(state))) goto Thaw; @@ -129,11 +143,12 @@ static void suspend_finish(suspend_state_t state) -static char * pm_states[] = { +static char *pm_states[PM_SUSPEND_MAX] = { [PM_SUSPEND_STANDBY] = "standby", 
[PM_SUSPEND_MEM] = "mem", +#ifdef CONFIG_SOFTWARE_SUSPEND [PM_SUSPEND_DISK] = "disk", - NULL, +#endif }; @@ -194,7 +209,7 @@ int software_suspend(void) int pm_suspend(suspend_state_t state) { - if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX) + if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX) return enter_state(state); return -EINVAL; } diff --git a/kernel/power/pm.c b/kernel/power/pm.c index 61deda04e39e..159149321b3c 100644 --- a/kernel/power/pm.c +++ b/kernel/power/pm.c @@ -60,9 +60,8 @@ struct pm_dev *pm_register(pm_dev_t type, unsigned long id, pm_callback callback) { - struct pm_dev *dev = kmalloc(sizeof(struct pm_dev), GFP_KERNEL); + struct pm_dev *dev = kzalloc(sizeof(struct pm_dev), GFP_KERNEL); if (dev) { - memset(dev, 0, sizeof(*dev)); dev->type = type; dev->id = id; dev->callback = callback; diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index 715081b2d829..7a4144ba3afd 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -9,6 +9,7 @@ #include <linux/init.h> #include <linux/pm.h> #include <linux/workqueue.h> +#include <linux/reboot.h> /* * When the user hits Sys-Rq o to power down the machine this is the @@ -17,8 +18,7 @@ static void do_poweroff(void *dummy) { - if (pm_power_off) - pm_power_off(); + kernel_power_off(); } static DECLARE_WORK(poweroff_work, do_poweroff, NULL); diff --git a/kernel/power/process.c b/kernel/power/process.c index 0a086640bcfc..28de118f7a0b 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -38,7 +38,6 @@ void refrigerator(void) processes around? */ long save; save = current->state; - current->state = TASK_UNINTERRUPTIBLE; pr_debug("%s entered refrigerator\n", current->comm); printk("="); @@ -47,8 +46,10 @@ void refrigerator(void) recalc_sigpending(); /* We sent fake signal, clean it up */ spin_unlock_irq(¤t->sighand->siglock); - while (frozen(current)) + while (frozen(current)) { + current->state = TASK_UNINTERRUPTIBLE; schedule(); + } pr_debug("%s left refrigerator\n", current->comm); current->state = save; } @@ -59,6 +60,7 @@ int freeze_processes(void) int todo; unsigned long start_time; struct task_struct *g, *p; + unsigned long flags; printk( "Stopping tasks: " ); start_time = jiffies; @@ -66,12 +68,9 @@ int freeze_processes(void) todo = 0; read_lock(&tasklist_lock); do_each_thread(g, p) { - unsigned long flags; if (!freezeable(p)) continue; - if ((frozen(p)) || - (p->state == TASK_TRACED) || - (p->state == TASK_STOPPED)) + if (frozen(p)) continue; freeze(p); @@ -82,13 +81,33 @@ int freeze_processes(void) } while_each_thread(g, p); read_unlock(&tasklist_lock); yield(); /* Yield is okay here */ - if (time_after(jiffies, start_time + TIMEOUT)) { + if (todo && time_after(jiffies, start_time + TIMEOUT)) { printk( "\n" ); printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo ); - return todo; + break; } } while(todo); + /* This does not unfreeze processes that are already frozen + * (we have slightly ugly calling convention in that respect, + * and caller must call thaw_processes() if something fails), + * but it cleans up leftover PF_FREEZE requests. 
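For context, the refrigerator() loop changed above is where a freezable kernel thread parks while the system is being frozen; a typical caller pattern looks roughly like this (sketch only, using the PF_FREEZE flag visible elsewhere in this file; the work and timeout details are placeholders):

/* simplified main loop of a freezable kernel thread */
for (;;) {
	if (current->flags & PF_FREEZE)
		refrigerator();			/* sleeps until thawed */

	/* ... do the thread's real work ... */

	set_current_state(TASK_INTERRUPTIBLE);
	schedule_timeout(HZ);
}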
+ */ + if (todo) { + read_lock(&tasklist_lock); + do_each_thread(g, p) + if (freezing(p)) { + pr_debug(" clean up: %s\n", p->comm); + p->flags &= ~PF_FREEZE; + spin_lock_irqsave(&p->sighand->siglock, flags); + recalc_sigpending_tsk(p); + spin_unlock_irqrestore(&p->sighand->siglock, flags); + } + while_each_thread(g, p); + read_unlock(&tasklist_lock); + return todo; + } + printk( "|\n" ); BUG_ON(in_atomic()); return 0; diff --git a/kernel/power/smp.c b/kernel/power/smp.c index bbe23079c62c..911fc62b8225 100644 --- a/kernel/power/smp.c +++ b/kernel/power/smp.c @@ -38,7 +38,7 @@ void disable_nonboot_cpus(void) } printk("Error taking cpu %d down: %d\n", cpu, error); } - BUG_ON(smp_processor_id() != 0); + BUG_ON(raw_smp_processor_id() != 0); if (error) panic("cpus not sleeping"); } diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index c285fc5a2320..d967e875ee82 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -31,6 +31,9 @@ * Alex Badea <vampire@go.ro>: * Fixed runaway init * + * Andreas Steinmetz <ast@domdv.de>: + * Added encrypted suspend option + * * More state savers are welcome. Especially for the scsi layer... * * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt @@ -63,6 +66,7 @@ #include <linux/console.h> #include <linux/highmem.h> #include <linux/bio.h> +#include <linux/mount.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -70,8 +74,16 @@ #include <asm/tlbflush.h> #include <asm/io.h> +#include <linux/random.h> +#include <linux/crypto.h> +#include <asm/scatterlist.h> + #include "power.h" +#define CIPHER "aes" +#define MAXKEY 32 +#define MAXIV 32 + /* References to section boundaries */ extern const void __nosave_begin, __nosave_end; @@ -102,7 +114,8 @@ static suspend_pagedir_t *pagedir_save; #define SWSUSP_SIG "S1SUSPEND" static struct swsusp_header { - char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; + char reserved[PAGE_SIZE - 20 - MAXKEY - MAXIV - sizeof(swp_entry_t)]; + u8 key_iv[MAXKEY+MAXIV]; swp_entry_t swsusp_info; char orig_sig[10]; char sig[10]; @@ -128,6 +141,131 @@ static struct swsusp_info swsusp_info; static unsigned short swapfile_used[MAX_SWAPFILES]; static unsigned short root_swap; +static int write_page(unsigned long addr, swp_entry_t * loc); +static int bio_read_page(pgoff_t page_off, void * page); + +static u8 key_iv[MAXKEY+MAXIV]; + +#ifdef CONFIG_SWSUSP_ENCRYPT + +static int crypto_init(int mode, void **mem) +{ + int error = 0; + int len; + char *modemsg; + struct crypto_tfm *tfm; + + modemsg = mode ? 
"suspend not possible" : "resume not possible"; + + tfm = crypto_alloc_tfm(CIPHER, CRYPTO_TFM_MODE_CBC); + if(!tfm) { + printk(KERN_ERR "swsusp: no tfm, %s\n", modemsg); + error = -EINVAL; + goto out; + } + + if(MAXKEY < crypto_tfm_alg_min_keysize(tfm)) { + printk(KERN_ERR "swsusp: key buffer too small, %s\n", modemsg); + error = -ENOKEY; + goto fail; + } + + if (mode) + get_random_bytes(key_iv, MAXKEY+MAXIV); + + len = crypto_tfm_alg_max_keysize(tfm); + if (len > MAXKEY) + len = MAXKEY; + + if (crypto_cipher_setkey(tfm, key_iv, len)) { + printk(KERN_ERR "swsusp: key setup failure, %s\n", modemsg); + error = -EKEYREJECTED; + goto fail; + } + + len = crypto_tfm_alg_ivsize(tfm); + + if (MAXIV < len) { + printk(KERN_ERR "swsusp: iv buffer too small, %s\n", modemsg); + error = -EOVERFLOW; + goto fail; + } + + crypto_cipher_set_iv(tfm, key_iv+MAXKEY, len); + + *mem=(void *)tfm; + + goto out; + +fail: crypto_free_tfm(tfm); +out: return error; +} + +static __inline__ void crypto_exit(void *mem) +{ + crypto_free_tfm((struct crypto_tfm *)mem); +} + +static __inline__ int crypto_write(struct pbe *p, void *mem) +{ + int error = 0; + struct scatterlist src, dst; + + src.page = virt_to_page(p->address); + src.offset = 0; + src.length = PAGE_SIZE; + dst.page = virt_to_page((void *)&swsusp_header); + dst.offset = 0; + dst.length = PAGE_SIZE; + + error = crypto_cipher_encrypt((struct crypto_tfm *)mem, &dst, &src, + PAGE_SIZE); + + if (!error) + error = write_page((unsigned long)&swsusp_header, + &(p->swap_address)); + return error; +} + +static __inline__ int crypto_read(struct pbe *p, void *mem) +{ + int error = 0; + struct scatterlist src, dst; + + error = bio_read_page(swp_offset(p->swap_address), (void *)p->address); + if (!error) { + src.offset = 0; + src.length = PAGE_SIZE; + dst.offset = 0; + dst.length = PAGE_SIZE; + src.page = dst.page = virt_to_page((void *)p->address); + + error = crypto_cipher_decrypt((struct crypto_tfm *)mem, &dst, + &src, PAGE_SIZE); + } + return error; +} +#else +static __inline__ int crypto_init(int mode, void *mem) +{ + return 0; +} + +static __inline__ void crypto_exit(void *mem) +{ +} + +static __inline__ int crypto_write(struct pbe *p, void *mem) +{ + return write_page(p->address, &(p->swap_address)); +} + +static __inline__ int crypto_read(struct pbe *p, void *mem) +{ + return bio_read_page(swp_offset(p->swap_address), (void *)p->address); +} +#endif + static int mark_swapfiles(swp_entry_t prev) { int error; @@ -139,6 +277,7 @@ static int mark_swapfiles(swp_entry_t prev) !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); memcpy(swsusp_header.sig,SWSUSP_SIG, 10); + memcpy(swsusp_header.key_iv, key_iv, MAXKEY+MAXIV); swsusp_header.swsusp_info = prev; error = rw_swap_page_sync(WRITE, swp_entry(root_swap, 0), @@ -178,9 +317,9 @@ static int swsusp_swap_check(void) /* This is called before saving image */ len=strlen(resume_file); root_swap = 0xFFFF; - swap_list_lock(); + spin_lock(&swap_lock); for (i=0; i<MAX_SWAPFILES; i++) { - if (swap_info[i].flags == 0) { + if (!(swap_info[i].flags & SWP_WRITEOK)) { swapfile_used[i]=SWAPFILE_UNUSED; } else { if (!len) { @@ -201,7 +340,7 @@ static int swsusp_swap_check(void) /* This is called before saving image */ } } } - swap_list_unlock(); + spin_unlock(&swap_lock); return (root_swap != 0xffff) ? 
0 : -ENODEV; } @@ -215,12 +354,12 @@ static void lock_swapdevices(void) { int i; - swap_list_lock(); + spin_lock(&swap_lock); for (i = 0; i< MAX_SWAPFILES; i++) if (swapfile_used[i] == SWAPFILE_IGNORED) { - swap_info[i].flags ^= 0xFF; + swap_info[i].flags ^= SWP_WRITEOK; } - swap_list_unlock(); + spin_unlock(&swap_lock); } /** @@ -285,6 +424,10 @@ static int data_write(void) int error = 0, i = 0; unsigned int mod = nr_copy_pages / 100; struct pbe *p; + void *tfm; + + if ((error = crypto_init(1, &tfm))) + return error; if (!mod) mod = 1; @@ -293,11 +436,14 @@ static int data_write(void) for_each_pbe (p, pagedir_nosave) { if (!(i%mod)) printk( "\b\b\b\b%3d%%", i / mod ); - if ((error = write_page(p->address, &(p->swap_address)))) + if ((error = crypto_write(p, tfm))) { + crypto_exit(tfm); return error; + } i++; } printk("\b\b\b\bdone\n"); + crypto_exit(tfm); return error; } @@ -384,7 +530,6 @@ static int write_pagedir(void) * write_suspend_image - Write entire image and metadata. * */ - static int write_suspend_image(void) { int error; @@ -399,6 +544,7 @@ static int write_suspend_image(void) if ((error = close_swap())) goto FreePagedir; Done: + memset(key_iv, 0, MAXKEY+MAXIV); return error; FreePagedir: free_pagedir_entries(); @@ -590,18 +736,7 @@ static void copy_data_pages(void) static int calc_nr(int nr_copy) { - int extra = 0; - int mod = !!(nr_copy % PBES_PER_PAGE); - int diff = (nr_copy / PBES_PER_PAGE) + mod; - - do { - extra += diff; - nr_copy += diff; - mod = !!(nr_copy % PBES_PER_PAGE); - diff = (nr_copy / PBES_PER_PAGE) + mod - extra; - } while (diff > 0); - - return nr_copy; + return nr_copy + (nr_copy+PBES_PER_PAGE-2)/(PBES_PER_PAGE-1); } /** @@ -869,13 +1004,6 @@ extern asmlinkage int swsusp_arch_resume(void); asmlinkage int swsusp_save(void) { - int error = 0; - - if ((error = swsusp_swap_check())) { - printk(KERN_ERR "swsusp: FATAL: cannot find swap device, try " - "swapon -a!\n"); - return error; - } return suspend_prepare_image(); } @@ -894,12 +1022,19 @@ int swsusp_suspend(void) if ((error = device_power_down(PMSG_FREEZE))) { printk(KERN_ERR "Some devices failed to power down, aborting suspend\n"); local_irq_enable(); - swsusp_free(); return error; } + + if ((error = swsusp_swap_check())) { + printk(KERN_ERR "swsusp: cannot find swap device, try swapon -a.\n"); + device_power_up(); + local_irq_enable(); + return error; + } + save_processor_state(); if ((error = swsusp_arch_suspend())) - swsusp_free(); + printk(KERN_ERR "Error %d suspending\n", error); /* Restore control flow magically appears here */ restore_processor_state(); BUG_ON (nr_copy_pages_check != nr_copy_pages); @@ -924,6 +1059,7 @@ int swsusp_resume(void) BUG_ON(!error); restore_processor_state(); restore_highmem(); + touch_softlockup_watchdog(); device_power_up(); local_irq_enable(); return error; @@ -1166,9 +1302,9 @@ static int bio_write_page(pgoff_t page_off, void * page) static const char * sanity_check(void) { dump_info(); - if(swsusp_info.version_code != LINUX_VERSION_CODE) + if (swsusp_info.version_code != LINUX_VERSION_CODE) return "kernel version"; - if(swsusp_info.num_physpages != num_physpages) + if (swsusp_info.num_physpages != num_physpages) return "memory size"; if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname)) return "system type"; @@ -1179,7 +1315,8 @@ static const char * sanity_check(void) if (strcmp(swsusp_info.uts.machine,system_utsname.machine)) return "machine"; #if 0 - if(swsusp_info.cpus != num_online_cpus()) + /* We can't use number of online CPUs when we use hotplug to remove 
them ;-))) */ + if (swsusp_info.cpus != num_possible_cpus()) return "number of cpus"; #endif return NULL; @@ -1212,13 +1349,14 @@ static int check_sig(void) return error; if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); + memcpy(key_iv, swsusp_header.key_iv, MAXKEY+MAXIV); + memset(swsusp_header.key_iv, 0, MAXKEY+MAXIV); /* * Reset swap signature now. */ error = bio_write_page(0, &swsusp_header); } else { - printk(KERN_ERR "swsusp: Suspend partition has wrong signature?\n"); return -EINVAL; } if (!error) @@ -1239,6 +1377,10 @@ static int data_read(struct pbe *pblist) int error = 0; int i = 0; int mod = swsusp_info.image_pages / 100; + void *tfm; + + if ((error = crypto_init(0, &tfm))) + return error; if (!mod) mod = 1; @@ -1250,19 +1392,18 @@ static int data_read(struct pbe *pblist) if (!(i % mod)) printk("\b\b\b\b%3d%%", i / mod); - error = bio_read_page(swp_offset(p->swap_address), - (void *)p->address); - if (error) + if ((error = crypto_read(p, tfm))) { + crypto_exit(tfm); return error; + } i++; } printk("\b\b\b\bdone\n"); + crypto_exit(tfm); return error; } -extern dev_t name_to_dev_t(const char *line); - /** * read_pagedir - Read page backup list pages from swap */ @@ -1356,16 +1497,6 @@ int swsusp_check(void) { int error; - if (!swsusp_resume_device) { - if (!strlen(resume_file)) - return -ENOENT; - swsusp_resume_device = name_to_dev_t(resume_file); - pr_debug("swsusp: Resume From Partition %s\n", resume_file); - } else { - pr_debug("swsusp: Resume From Partition %d:%d\n", - MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); - } - resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); if (!IS_ERR(resume_bdev)) { set_blocksize(resume_bdev, PAGE_SIZE); @@ -1397,6 +1528,7 @@ int swsusp_read(void) error = read_suspend_image(); blkdev_put(resume_bdev); + memset(key_iv, 0, MAXKEY+MAXIV); if (!error) pr_debug("swsusp: Reading resume file was successful\n"); diff --git a/kernel/printk.c b/kernel/printk.c index 5092397fac29..a967605bc2e3 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -514,6 +514,9 @@ asmlinkage int printk(const char *fmt, ...) return r; } +/* cpu currently holding logbuf_lock */ +static volatile unsigned int printk_cpu = UINT_MAX; + asmlinkage int vprintk(const char *fmt, va_list args) { unsigned long flags; @@ -522,11 +525,15 @@ asmlinkage int vprintk(const char *fmt, va_list args) static char printk_buf[1024]; static int log_level_unknown = 1; - if (unlikely(oops_in_progress)) + preempt_disable(); + if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id()) + /* If a crash is occurring during printk() on this CPU, + * make sure we can't deadlock */ zap_locks(); /* This stops the holder of console_sem just where we want him */ spin_lock_irqsave(&logbuf_lock, flags); + printk_cpu = smp_processor_id(); /* Emit the output into the temporary buffer */ printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args); @@ -595,6 +602,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) * CPU until it is officially up. We shouldn't be calling into * random console drivers on a CPU which doesn't exist yet.. */ + printk_cpu = UINT_MAX; spin_unlock_irqrestore(&logbuf_lock, flags); goto out; } @@ -604,6 +612,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) * We own the drivers. 
We can drop the spinlock and let * release_console_sem() print the text */ + printk_cpu = UINT_MAX; spin_unlock_irqrestore(&logbuf_lock, flags); console_may_schedule = 0; release_console_sem(); @@ -613,9 +622,11 @@ asmlinkage int vprintk(const char *fmt, va_list args) * allows the semaphore holder to proceed and to call the * console drivers with the output which we just produced. */ + printk_cpu = UINT_MAX; spin_unlock_irqrestore(&logbuf_lock, flags); } out: + preempt_enable(); return printed_len; } EXPORT_SYMBOL(printk); diff --git a/kernel/profile.c b/kernel/profile.c index ad8cbb75ffa2..f89248e6d704 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -35,11 +35,11 @@ struct profile_hit { #define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ) /* Oprofile timer tick hook */ -int (*timer_hook)(struct pt_regs *); +int (*timer_hook)(struct pt_regs *) __read_mostly; static atomic_t *prof_buffer; static unsigned long prof_len, prof_shift; -static int prof_on; +static int prof_on __read_mostly; static cpumask_t prof_cpu_mask = CPU_MASK_ALL; #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 8dcb8f6288bc..019e04ec065a 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -118,6 +118,33 @@ int ptrace_check_attach(struct task_struct *child, int kill) return ret; } +static int may_attach(struct task_struct *task) +{ + if (!task->mm) + return -EPERM; + if (((current->uid != task->euid) || + (current->uid != task->suid) || + (current->uid != task->uid) || + (current->gid != task->egid) || + (current->gid != task->sgid) || + (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) + return -EPERM; + smp_rmb(); + if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE)) + return -EPERM; + + return security_ptrace(current, task); +} + +int ptrace_may_attach(struct task_struct *task) +{ + int err; + task_lock(task); + err = may_attach(task); + task_unlock(task); + return !err; +} + int ptrace_attach(struct task_struct *task) { int retval; @@ -127,22 +154,10 @@ int ptrace_attach(struct task_struct *task) goto bad; if (task == current) goto bad; - if (!task->mm) - goto bad; - if(((current->uid != task->euid) || - (current->uid != task->suid) || - (current->uid != task->uid) || - (current->gid != task->egid) || - (current->gid != task->sgid) || - (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) - goto bad; - smp_rmb(); - if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE)) - goto bad; /* the same process cannot be attached many times */ if (task->ptrace & PT_PTRACED) goto bad; - retval = security_ptrace(current, task); + retval = may_attach(task); if (retval) goto bad; diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index f436993bd590..bef3b6901b76 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -45,6 +45,7 @@ #include <linux/percpu.h> #include <linux/notifier.h> #include <linux/rcupdate.h> +#include <linux/rcuref.h> #include <linux/cpu.h> /* Definition for rcupdate control block. */ @@ -72,6 +73,19 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; static int maxbatch = 10; +#ifndef __HAVE_ARCH_CMPXCHG +/* + * We use an array of spinlocks for the rcurefs -- similar to ones in sparc + * 32 bit atomic_t implementations, and a hash function similar to that + * for our refcounting needs. + * Can't help multiprocessors which donot have cmpxchg :( + */ + +spinlock_t __rcuref_hash[RCUREF_HASH_SIZE] = { + [0 ... 
(RCUREF_HASH_SIZE-1)] = SPIN_LOCK_UNLOCKED +}; +#endif + /** * call_rcu - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. diff --git a/kernel/resource.c b/kernel/resource.c index 26967e042201..92285d822de6 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -430,10 +430,9 @@ EXPORT_SYMBOL(adjust_resource); */ struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name) { - struct resource *res = kmalloc(sizeof(*res), GFP_KERNEL); + struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); if (res) { - memset(res, 0, sizeof(*res)); res->name = name; res->start = start; res->end = start + n - 1; diff --git a/kernel/sched.c b/kernel/sched.c index 5f2182d42241..81b3a96ed2d0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -875,7 +875,7 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) * smp_call_function() if an IPI is sent by the same process we are * waiting to become inactive. */ -void wait_task_inactive(task_t * p) +void wait_task_inactive(task_t *p) { unsigned long flags; runqueue_t *rq; @@ -966,8 +966,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) int local_group; int i; + /* Skip over this group if it has no CPUs allowed */ + if (!cpus_intersects(group->cpumask, p->cpus_allowed)) + goto nextgroup; + local_group = cpu_isset(this_cpu, group->cpumask); - /* XXX: put a cpus allowed check */ /* Tally up the load of all CPUs in the group */ avg_load = 0; @@ -992,6 +995,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) min_load = avg_load; idlest = group; } +nextgroup: group = group->next; } while (group != sd->groups); @@ -1003,13 +1007,18 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) /* * find_idlest_queue - find the idlest runqueue among the cpus in group. */ -static int find_idlest_cpu(struct sched_group *group, int this_cpu) +static int +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { + cpumask_t tmp; unsigned long load, min_load = ULONG_MAX; int idlest = -1; int i; - for_each_cpu_mask(i, group->cpumask) { + /* Traverse only the allowed CPUs */ + cpus_and(tmp, group->cpumask, p->cpus_allowed); + + for_each_cpu_mask(i, tmp) { load = source_load(i, 0); if (load < min_load || (load == min_load && i == this_cpu)) { @@ -1052,7 +1061,7 @@ static int sched_balance_self(int cpu, int flag) if (!group) goto nextlevel; - new_cpu = find_idlest_cpu(group, cpu); + new_cpu = find_idlest_cpu(group, t, cpu); if (new_cpu == -1 || new_cpu == cpu) goto nextlevel; @@ -1127,7 +1136,7 @@ static inline int wake_idle(int cpu, task_t *p) * * returns failure only if the task is already active. */ -static int try_to_wake_up(task_t * p, unsigned int state, int sync) +static int try_to_wake_up(task_t *p, unsigned int state, int sync) { int cpu, this_cpu, success = 0; unsigned long flags; @@ -1252,6 +1261,16 @@ out_activate: } /* + * Tasks that have marked their sleep as noninteractive get + * woken up without updating their sleep average. (i.e. their + * sleep is handled in a priority-neutral manner, no priority + * boost and no penalty.) + */ + if (old_state & TASK_NONINTERACTIVE) + __activate_task(p, rq); + else + activate_task(p, rq, cpu == this_cpu); + /* * Sync wakeups (i.e. 
those types of wakeups where the waker * has indicated that it will leave the CPU in short order) * don't trigger a preemption, if the woken up task will run on @@ -1259,7 +1278,6 @@ out_activate: * the waker guarantees that the freshly woken up task is going * to be considered on this CPU.) */ - activate_task(p, rq, cpu == this_cpu); if (!sync || cpu != this_cpu) { if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); @@ -1274,7 +1292,7 @@ out: return success; } -int fastcall wake_up_process(task_t * p) +int fastcall wake_up_process(task_t *p) { return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); @@ -1353,7 +1371,7 @@ void fastcall sched_fork(task_t *p, int clone_flags) * that must be done for every newly created context, then puts the task * on the runqueue and wakes it. */ -void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) +void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) { unsigned long flags; int this_cpu, cpu; @@ -1436,7 +1454,7 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) * artificially, because any timeslice recovered here * was given away by the parent in the first place.) */ -void fastcall sched_exit(task_t * p) +void fastcall sched_exit(task_t *p) { unsigned long flags; runqueue_t *rq; @@ -1478,6 +1496,7 @@ static inline void prepare_task_switch(runqueue_t *rq, task_t *next) /** * finish_task_switch - clean up after a task-switch + * @rq: runqueue associated with task-switch * @prev: the thread we just switched away from. * * finish_task_switch must be called after the context switch, paired @@ -1510,6 +1529,10 @@ static inline void finish_task_switch(runqueue_t *rq, task_t *prev) * Manfred Spraul <manfred@colorfullife.com> */ prev_task_flags = prev->flags; +#ifdef CONFIG_DEBUG_SPINLOCK + /* this is a valid case when another task releases the spinlock */ + rq->lock.owner = current; +#endif finish_arch_switch(prev); finish_lock_switch(rq, prev); if (mm) @@ -1752,7 +1775,8 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, */ static inline int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, - struct sched_domain *sd, enum idle_type idle, int *all_pinned) + struct sched_domain *sd, enum idle_type idle, + int *all_pinned) { /* * We do not migrate tasks that are: @@ -1882,10 +1906,11 @@ out: */ static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, - unsigned long *imbalance, enum idle_type idle) + unsigned long *imbalance, enum idle_type idle, int *sd_idle) { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; + unsigned long max_pull; int load_idx; max_load = this_load = total_load = total_pwr = 0; @@ -1907,6 +1932,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, avg_load = 0; for_each_cpu_mask(i, group->cpumask) { + if (*sd_idle && !idle_cpu(i)) + *sd_idle = 0; + /* Bias balancing toward cpus of our domain */ if (local_group) load = target_load(i, load_idx); @@ -1932,7 +1960,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, group = group->next; } while (group != sd->groups); - if (!busiest || this_load >= max_load) + if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE) goto out_balanced; avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; @@ -1952,8 +1980,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * by pulling tasks to us. 
Be careful of negative numbers as they'll * appear as very large values with unsigned longs. */ + + /* Don't want to pull so many tasks that a group would go idle */ + max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE); + /* How much load to actually move to equalise the imbalance */ - *imbalance = min((max_load - avg_load) * busiest->cpu_power, + *imbalance = min(max_pull * busiest->cpu_power, (avg_load - this_load) * this->cpu_power) / SCHED_LOAD_SCALE; @@ -2050,11 +2082,14 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, unsigned long imbalance; int nr_moved, all_pinned = 0; int active_balance = 0; + int sd_idle = 0; + + if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER) + sd_idle = 1; - spin_lock(&this_rq->lock); schedstat_inc(sd, lb_cnt[idle]); - group = find_busiest_group(sd, this_cpu, &imbalance, idle); + group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle); if (!group) { schedstat_inc(sd, lb_nobusyg[idle]); goto out_balanced; @@ -2078,19 +2113,16 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, * still unbalanced. nr_moved simply stays zero, so it is * correctly treated as an imbalance. */ - double_lock_balance(this_rq, busiest); + double_rq_lock(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, - imbalance, sd, idle, - &all_pinned); - spin_unlock(&busiest->lock); + imbalance, sd, idle, &all_pinned); + double_rq_unlock(this_rq, busiest); /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(all_pinned)) goto out_balanced; } - spin_unlock(&this_rq->lock); - if (!nr_moved) { schedstat_inc(sd, lb_failed[idle]); sd->nr_balance_failed++; @@ -2098,6 +2130,16 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { spin_lock(&busiest->lock); + + /* don't kick the migration_thread, if the curr + * task on busiest cpu can't be moved to this_cpu + */ + if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { + spin_unlock(&busiest->lock); + all_pinned = 1; + goto out_one_pinned; + } + if (!busiest->active_balance) { busiest->active_balance = 1; busiest->push_cpu = this_cpu; @@ -2130,19 +2172,23 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, sd->balance_interval *= 2; } + if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER) + return -1; return nr_moved; out_balanced: - spin_unlock(&this_rq->lock); - schedstat_inc(sd, lb_balanced[idle]); sd->nr_balance_failed = 0; + +out_one_pinned: /* tune up the balancing interval */ if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) + return -1; return 0; } @@ -2160,9 +2206,13 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, runqueue_t *busiest = NULL; unsigned long imbalance; int nr_moved = 0; + int sd_idle = 0; + + if (sd->flags & SD_SHARE_CPUPOWER) + sd_idle = 1; schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); - group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); + group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle); if (!group) { schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); goto out_balanced; @@ -2176,22 +2226,30 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, BUG_ON(busiest == this_rq); - /* Attempt to move tasks */ - double_lock_balance(this_rq, busiest); - schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); - nr_moved = move_tasks(this_rq, this_cpu, busiest, + 
+ nr_moved = 0; + if (busiest->nr_running > 1) { + /* Attempt to move tasks */ + double_lock_balance(this_rq, busiest); + nr_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd, NEWLY_IDLE, NULL); - if (!nr_moved) + spin_unlock(&busiest->lock); + } + + if (!nr_moved) { schedstat_inc(sd, lb_failed[NEWLY_IDLE]); - else + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) + return -1; + } else sd->nr_balance_failed = 0; - spin_unlock(&busiest->lock); return nr_moved; out_balanced: schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); + if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) + return -1; sd->nr_balance_failed = 0; return 0; } @@ -2316,7 +2374,11 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, if (j - sd->last_balance >= interval) { if (load_balance(this_cpu, this_rq, sd, idle)) { - /* We've pulled tasks over so no longer idle */ + /* + * We've pulled tasks over so either we're no + * longer idle, or one of our SMT siblings is + * not idle. + */ idle = NOT_IDLE; } sd->last_balance += interval; @@ -2575,6 +2637,13 @@ out: } #ifdef CONFIG_SCHED_SMT +static inline void wakeup_busy_runqueue(runqueue_t *rq) +{ + /* If an SMT runqueue is sleeping due to priority reasons wake it up */ + if (rq->curr == rq->idle && rq->nr_running) + resched_task(rq->idle); +} + static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) { struct sched_domain *tmp, *sd = NULL; @@ -2608,12 +2677,7 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) for_each_cpu_mask(i, sibling_map) { runqueue_t *smt_rq = cpu_rq(i); - /* - * If an SMT sibling task is sleeping due to priority - * reasons wake it up now. - */ - if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running) - resched_task(smt_rq->idle); + wakeup_busy_runqueue(smt_rq); } for_each_cpu_mask(i, sibling_map) @@ -2624,6 +2688,16 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) */ } +/* + * number of 'lost' timeslices this task wont be able to fully + * utilize, if another task runs on a sibling. This models the + * slowdown effect of other tasks running on siblings: + */ +static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd) +{ + return p->time_slice * (100 - sd->per_cpu_gain) / 100; +} + static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) { struct sched_domain *tmp, *sd = NULL; @@ -2667,6 +2741,10 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) runqueue_t *smt_rq = cpu_rq(i); task_t *smt_curr = smt_rq->curr; + /* Kernel threads do not participate in dependent sleeping */ + if (!p->mm || !smt_curr->mm || rt_task(p)) + goto check_smt_task; + /* * If a user task with lower static priority than the * running task on the SMT sibling is trying to schedule, @@ -2675,21 +2753,45 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) * task from using an unfair proportion of the * physical cpu's resources. -ck */ - if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) > - task_timeslice(p) || rt_task(smt_curr)) && - p->mm && smt_curr->mm && !rt_task(p)) - ret = 1; + if (rt_task(smt_curr)) { + /* + * With real time tasks we run non-rt tasks only + * per_cpu_gain% of the time. 
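
To make the SMT fairness arithmetic above easier to follow, here is a small stand-alone user-space sketch, not part of the patch, that mirrors the two tests dependent_sleeper() now applies: the smt_slice() estimate of the timeslice lost to a sibling, and the jiffies % DEF_TIMESLICE window that rations non-RT work next to a real-time sibling. DEF_TIMESLICE and the per_cpu_gain value used here are illustrative stand-ins, not the kernel's actual constants.

#include <stdio.h>

#define DEF_TIMESLICE 100              /* stand-in value, in ticks */

struct fake_task {
	int time_slice;                /* remaining timeslice, in ticks */
};

/*
 * Mirror of the patch's smt_slice(): the part of p's timeslice that is
 * effectively lost when another task runs on the SMT sibling, given the
 * domain's per_cpu_gain percentage.
 */
static int smt_slice(const struct fake_task *p, int per_cpu_gain)
{
	return p->time_slice * (100 - per_cpu_gain) / 100;
}

/*
 * The RT gate above: with a real-time task on the sibling, non-RT work
 * is only allowed to run for per_cpu_gain% of each DEF_TIMESLICE window.
 */
static int rt_window_open(unsigned long jiffies, int per_cpu_gain)
{
	return (jiffies % DEF_TIMESLICE) <=
		(unsigned long)(per_cpu_gain * DEF_TIMESLICE / 100);
}

int main(void)
{
	struct fake_task p = { .time_slice = 80 };
	int per_cpu_gain = 25;         /* illustrative value only */
	unsigned long j;

	printf("smt_slice = %d ticks\n", smt_slice(&p, per_cpu_gain));
	for (j = 0; j < 200; j += 25)
		printf("tick %3lu: non-RT runs beside RT sibling: %s\n",
		       j, rt_window_open(j, per_cpu_gain) ? "yes" : "no");
	return 0;
}
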
+ */ + if ((jiffies % DEF_TIMESLICE) > + (sd->per_cpu_gain * DEF_TIMESLICE / 100)) + ret = 1; + } else + if (smt_curr->static_prio < p->static_prio && + !TASK_PREEMPTS_CURR(p, smt_rq) && + smt_slice(smt_curr, sd) > task_timeslice(p)) + ret = 1; + +check_smt_task: + if ((!smt_curr->mm && smt_curr != smt_rq->idle) || + rt_task(smt_curr)) + continue; + if (!p->mm) { + wakeup_busy_runqueue(smt_rq); + continue; + } /* - * Reschedule a lower priority task on the SMT sibling, - * or wake it up if it has been put to sleep for priority - * reasons. + * Reschedule a lower priority task on the SMT sibling for + * it to be put to sleep, or wake it up if it has been put to + * sleep for priority reasons to see if it should run now. */ - if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) > - task_timeslice(smt_curr) || rt_task(p)) && - smt_curr->mm && p->mm && !rt_task(smt_curr)) || - (smt_curr == smt_rq->idle && smt_rq->nr_running)) - resched_task(smt_curr); + if (rt_task(p)) { + if ((jiffies % DEF_TIMESLICE) > + (sd->per_cpu_gain * DEF_TIMESLICE / 100)) + resched_task(smt_curr); + } else { + if (TASK_PREEMPTS_CURR(p, smt_rq) && + smt_slice(p, sd) > task_timeslice(smt_curr)) + resched_task(smt_curr); + else + wakeup_busy_runqueue(smt_rq); + } } out_unlock: for_each_cpu_mask(i, sibling_map) @@ -2887,6 +2989,7 @@ switch_tasks: if (next == rq->idle) schedstat_inc(rq, sched_goidle); prefetch(next); + prefetch_stack(next); clear_tsk_need_resched(prev); rcu_qsctr_inc(task_cpu(prev)); @@ -3014,7 +3117,8 @@ need_resched: #endif /* CONFIG_PREEMPT */ -int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) +int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, + void *key) { task_t *p = curr->private; return try_to_wake_up(p, mode, sync); @@ -3056,7 +3160,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, * @key: is directly passed to the wakeup function */ void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, void *key) + int nr_exclusive, void *key) { unsigned long flags; @@ -3088,7 +3192,8 @@ void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) * * On UP it can prevent extra preemption. */ -void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +void fastcall +__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) { unsigned long flags; int sync = 1; @@ -3279,7 +3384,8 @@ void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) EXPORT_SYMBOL(interruptible_sleep_on); -long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) +long fastcall __sched +interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) { SLEEP_ON_VAR @@ -3378,8 +3484,8 @@ EXPORT_SYMBOL(set_user_nice); */ int can_nice(const task_t *p, const int nice) { - /* convert nice value [19,-20] to rlimit style value [0,39] */ - int nice_rlim = 19 - nice; + /* convert nice value [19,-20] to rlimit style value [1,40] */ + int nice_rlim = 20 - nice; return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || capable(CAP_SYS_NICE)); } @@ -3486,7 +3592,7 @@ static void __setscheduler(struct task_struct *p, int policy, int prio) p->policy = policy; p->rt_priority = prio; if (policy != SCHED_NORMAL) - p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority; + p->prio = MAX_RT_PRIO-1 - p->rt_priority; else p->prio = p->static_prio; } @@ -3498,7 +3604,8 @@ static void __setscheduler(struct task_struct *p, int policy, int prio) * @policy: new policy. 
* @param: structure containing the new RT priority. */ -int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) +int sched_setscheduler(struct task_struct *p, int policy, + struct sched_param *param) { int retval; int oldprio, oldpolicy = -1; @@ -3518,7 +3625,8 @@ recheck: * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0. */ if (param->sched_priority < 0 || - param->sched_priority > MAX_USER_RT_PRIO-1) + (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || + (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) return -EINVAL; if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) return -EINVAL; @@ -3528,7 +3636,8 @@ recheck: */ if (!capable(CAP_SYS_NICE)) { /* can't change policy */ - if (policy != p->policy) + if (policy != p->policy && + !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) return -EPERM; /* can't increase priority */ if (policy != SCHED_NORMAL && @@ -3579,7 +3688,8 @@ recheck: } EXPORT_SYMBOL_GPL(sched_setscheduler); -static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) +static int +do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) { int retval; struct sched_param lparam; @@ -3846,7 +3956,7 @@ asmlinkage long sys_sched_yield(void) if (rt_task(current)) target = rq->active; - if (current->array->nr_active == 1) { + if (array->nr_active == 1) { schedstat_inc(rq, yld_act_empty); if (!rq->expired->nr_active) schedstat_inc(rq, yld_both_empty); @@ -3877,6 +3987,13 @@ asmlinkage long sys_sched_yield(void) static inline void __cond_resched(void) { + /* + * The BKS might be reacquired before we have dropped + * PREEMPT_ACTIVE, which could trigger a second + * cond_resched() call. + */ + if (unlikely(preempt_count())) + return; do { add_preempt_count(PREEMPT_ACTIVE); schedule(); @@ -3903,7 +4020,7 @@ EXPORT_SYMBOL(cond_resched); * operations here to prevent schedule() from being called twice (once via * spin_unlock(), once by hand). */ -int cond_resched_lock(spinlock_t * lock) +int cond_resched_lock(spinlock_t *lock) { int ret = 0; @@ -4086,7 +4203,7 @@ static inline struct task_struct *younger_sibling(struct task_struct *p) return list_entry(p->sibling.next,struct task_struct,sibling); } -static void show_task(task_t * p) +static void show_task(task_t *p) { task_t *relative; unsigned state; @@ -4112,7 +4229,7 @@ static void show_task(task_t * p) #endif #ifdef CONFIG_DEBUG_STACK_USAGE { - unsigned long * n = (unsigned long *) (p->thread_info+1); + unsigned long *n = (unsigned long *) (p->thread_info+1); while (!*n) n++; free = (unsigned long) n - (unsigned long)(p->thread_info+1); @@ -4321,7 +4438,7 @@ out: * thread migration by bumping thread off CPU then 'pushing' onto * another runqueue. */ -static int migration_thread(void * data) +static int migration_thread(void *data) { runqueue_t *rq; int cpu = (long)data; @@ -4770,7 +4887,7 @@ static int sd_parent_degenerate(struct sched_domain *sd, * Attach the domain 'sd' to 'cpu' as its base domain. Callers must * hold the hotplug lock. 
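
The widened range check above distinguishes user tasks (which have an mm and stay capped at MAX_USER_RT_PRIO-1) from kernel threads (no mm, allowed up to MAX_RT_PRIO-1), and still requires SCHED_NORMAL to use priority 0. Below is a stand-alone sketch of that predicate; only the shape of the test follows the patch, the numeric limits are stand-ins.

#include <stdio.h>

#define MAX_USER_RT_PRIO 100           /* stand-in values */
#define MAX_RT_PRIO      100
#define SCHED_NORMAL     0
#define SCHED_FIFO       1

/*
 * Shape of the priority check now done in sched_setscheduler():
 * user tasks (has_mm) may use 1..MAX_USER_RT_PRIO-1, kernel threads
 * may go up to MAX_RT_PRIO-1, and SCHED_NORMAL must use priority 0.
 */
static int prio_valid(int policy, int prio, int has_mm)
{
	if (prio < 0)
		return 0;
	if (has_mm && prio > MAX_USER_RT_PRIO - 1)
		return 0;
	if (!has_mm && prio > MAX_RT_PRIO - 1)
		return 0;
	if ((policy == SCHED_NORMAL) != (prio == 0))
		return 0;
	return 1;
}

int main(void)
{
	printf("%d\n", prio_valid(SCHED_FIFO, 99, 1));   /* user RT: ok */
	printf("%d\n", prio_valid(SCHED_FIFO, 150, 1));  /* out of range */
	printf("%d\n", prio_valid(SCHED_NORMAL, 0, 0));  /* kthread, normal: ok */
	return 0;
}
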
*/ -void cpu_attach_domain(struct sched_domain *sd, int cpu) +static void cpu_attach_domain(struct sched_domain *sd, int cpu) { runqueue_t *rq = cpu_rq(cpu); struct sched_domain *tmp; @@ -4793,7 +4910,7 @@ void cpu_attach_domain(struct sched_domain *sd, int cpu) } /* cpus with isolated domains */ -cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; +static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) @@ -4821,8 +4938,8 @@ __setup ("isolcpus=", isolated_cpu_setup); * covered by the given span, and will set each group's ->cpumask correctly, * and ->cpu_power to 0. */ -void init_sched_build_groups(struct sched_group groups[], - cpumask_t span, int (*group_fn)(int cpu)) +static void init_sched_build_groups(struct sched_group groups[], cpumask_t span, + int (*group_fn)(int cpu)) { struct sched_group *first = NULL, *last = NULL; cpumask_t covered = CPU_MASK_NONE; @@ -4855,12 +4972,85 @@ void init_sched_build_groups(struct sched_group groups[], last->next = first; } +#define SD_NODES_PER_DOMAIN 16 -#ifdef ARCH_HAS_SCHED_DOMAIN -extern void build_sched_domains(const cpumask_t *cpu_map); -extern void arch_init_sched_domains(const cpumask_t *cpu_map); -extern void arch_destroy_sched_domains(const cpumask_t *cpu_map); -#else +#ifdef CONFIG_NUMA +/** + * find_next_best_node - find the next node to include in a sched_domain + * @node: node whose sched_domain we're building + * @used_nodes: nodes already in the sched_domain + * + * Find the next node to include in a given scheduling domain. Simply + * finds the closest node not already in the @used_nodes map. + * + * Should use nodemask_t. + */ +static int find_next_best_node(int node, unsigned long *used_nodes) +{ + int i, n, val, min_val, best_node = 0; + + min_val = INT_MAX; + + for (i = 0; i < MAX_NUMNODES; i++) { + /* Start at @node */ + n = (node + i) % MAX_NUMNODES; + + if (!nr_cpus_node(n)) + continue; + + /* Skip already used nodes */ + if (test_bit(n, used_nodes)) + continue; + + /* Simple min distance search */ + val = node_distance(node, n); + + if (val < min_val) { + min_val = val; + best_node = n; + } + } + + set_bit(best_node, used_nodes); + return best_node; +} + +/** + * sched_domain_node_span - get a cpumask for a node's sched_domain + * @node: node whose cpumask we're constructing + * @size: number of nodes to include in this span + * + * Given a node, construct a good cpumask for its sched_domain to span. It + * should be one that prevents unnecessary balancing, but also spreads tasks + * out optimally. + */ +static cpumask_t sched_domain_node_span(int node) +{ + int i; + cpumask_t span, nodemask; + DECLARE_BITMAP(used_nodes, MAX_NUMNODES); + + cpus_clear(span); + bitmap_zero(used_nodes, MAX_NUMNODES); + + nodemask = node_to_cpumask(node); + cpus_or(span, span, nodemask); + set_bit(node, used_nodes); + + for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { + int next_node = find_next_best_node(node, used_nodes); + nodemask = node_to_cpumask(next_node); + cpus_or(span, span, nodemask); + } + + return span; +} +#endif + +/* + * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we + * can switch it on easily if needed. 
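
The NUMA span code above is a greedy nearest-neighbour walk: sched_domain_node_span() keeps asking find_next_best_node() for the closest node not yet included and ORs that node's CPUs into the span. Here is a stand-alone sketch of that selection, not part of the patch, using a made-up 4-node distance table (the matrix and NR_NODES are purely illustrative).

#include <stdio.h>
#include <limits.h>

#define NR_NODES 4

/* Hypothetical symmetric node distance table (smaller means closer). */
static const int node_distance[NR_NODES][NR_NODES] = {
	{ 10, 20, 40, 40 },
	{ 20, 10, 40, 40 },
	{ 40, 40, 10, 20 },
	{ 40, 40, 20, 10 },
};

/*
 * Greedy pick of the closest node not yet in the span, the same rule
 * find_next_best_node() applies (minus its "node has CPUs" filter).
 */
static int next_best_node(int node, int used[NR_NODES])
{
	int i, n, best = 0, min_val = INT_MAX;

	for (i = 0; i < NR_NODES; i++) {
		n = (node + i) % NR_NODES;
		if (used[n])
			continue;
		if (node_distance[node][n] < min_val) {
			min_val = node_distance[node][n];
			best = n;
		}
	}
	used[best] = 1;
	return best;
}

int main(void)
{
	int used[NR_NODES] = { 0 };
	int i, node = 2;

	used[node] = 1;                /* a node's span always includes itself */
	printf("span build order for node %d: %d", node, node);
	for (i = 1; i < NR_NODES; i++)
		printf(" -> %d", next_best_node(node, used));
	printf("\n");
	return 0;
}
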
+ */ #ifdef CONFIG_SCHED_SMT static DEFINE_PER_CPU(struct sched_domain, cpu_domains); static struct sched_group sched_group_cpus[NR_CPUS]; @@ -4882,36 +5072,20 @@ static int cpu_to_phys_group(int cpu) } #ifdef CONFIG_NUMA - -static DEFINE_PER_CPU(struct sched_domain, node_domains); -static struct sched_group sched_group_nodes[MAX_NUMNODES]; -static int cpu_to_node_group(int cpu) -{ - return cpu_to_node(cpu); -} -#endif - -#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) /* - * The domains setup code relies on siblings not spanning - * multiple nodes. Make sure the architecture has a proper - * siblings map: + * The init_sched_build_groups can't handle what we want to do with node + * groups, so roll our own. Now each node has its own list of groups which + * gets dynamically allocated. */ -static void check_sibling_maps(void) -{ - int i, j; +static DEFINE_PER_CPU(struct sched_domain, node_domains); +static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; - for_each_online_cpu(i) { - for_each_cpu_mask(j, cpu_sibling_map[i]) { - if (cpu_to_node(i) != cpu_to_node(j)) { - printk(KERN_INFO "warning: CPU %d siblings map " - "to different node - isolating " - "them.\n", i); - cpu_sibling_map[i] = cpumask_of_cpu(i); - break; - } - } - } +static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); +static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS]; + +static int cpu_to_allnodes_group(int cpu) +{ + return cpu_to_node(cpu); } #endif @@ -4919,9 +5093,24 @@ static void check_sibling_maps(void) * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus */ -static void build_sched_domains(const cpumask_t *cpu_map) +void build_sched_domains(const cpumask_t *cpu_map) { int i; +#ifdef CONFIG_NUMA + struct sched_group **sched_group_nodes = NULL; + struct sched_group *sched_group_allnodes = NULL; + + /* + * Allocate the per-node list of sched groups + */ + sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES, + GFP_ATOMIC); + if (!sched_group_nodes) { + printk(KERN_WARNING "Can not alloc sched group node list\n"); + return; + } + sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; +#endif /* * Set up domains for cpus specified by the cpu_map. 
@@ -4934,11 +5123,35 @@ static void build_sched_domains(const cpumask_t *cpu_map) cpus_and(nodemask, nodemask, *cpu_map); #ifdef CONFIG_NUMA + if (cpus_weight(*cpu_map) + > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { + if (!sched_group_allnodes) { + sched_group_allnodes + = kmalloc(sizeof(struct sched_group) + * MAX_NUMNODES, + GFP_KERNEL); + if (!sched_group_allnodes) { + printk(KERN_WARNING + "Can not alloc allnodes sched group\n"); + break; + } + sched_group_allnodes_bycpu[i] + = sched_group_allnodes; + } + sd = &per_cpu(allnodes_domains, i); + *sd = SD_ALLNODES_INIT; + sd->span = *cpu_map; + group = cpu_to_allnodes_group(i); + sd->groups = &sched_group_allnodes[group]; + p = sd; + } else + p = NULL; + sd = &per_cpu(node_domains, i); - group = cpu_to_node_group(i); *sd = SD_NODE_INIT; - sd->span = *cpu_map; - sd->groups = &sched_group_nodes[group]; + sd->span = sched_domain_node_span(cpu_to_node(i)); + sd->parent = p; + cpus_and(sd->span, sd->span, *cpu_map); #endif p = sd; @@ -4963,7 +5176,7 @@ static void build_sched_domains(const cpumask_t *cpu_map) #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ - for_each_online_cpu(i) { + for_each_cpu_mask(i, *cpu_map) { cpumask_t this_sibling_map = cpu_sibling_map[i]; cpus_and(this_sibling_map, this_sibling_map, *cpu_map); if (i != first_cpu(this_sibling_map)) @@ -4988,8 +5201,77 @@ static void build_sched_domains(const cpumask_t *cpu_map) #ifdef CONFIG_NUMA /* Set up node groups */ - init_sched_build_groups(sched_group_nodes, *cpu_map, - &cpu_to_node_group); + if (sched_group_allnodes) + init_sched_build_groups(sched_group_allnodes, *cpu_map, + &cpu_to_allnodes_group); + + for (i = 0; i < MAX_NUMNODES; i++) { + /* Set up node groups */ + struct sched_group *sg, *prev; + cpumask_t nodemask = node_to_cpumask(i); + cpumask_t domainspan; + cpumask_t covered = CPU_MASK_NONE; + int j; + + cpus_and(nodemask, nodemask, *cpu_map); + if (cpus_empty(nodemask)) { + sched_group_nodes[i] = NULL; + continue; + } + + domainspan = sched_domain_node_span(i); + cpus_and(domainspan, domainspan, *cpu_map); + + sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); + sched_group_nodes[i] = sg; + for_each_cpu_mask(j, nodemask) { + struct sched_domain *sd; + sd = &per_cpu(node_domains, j); + sd->groups = sg; + if (sd->groups == NULL) { + /* Turn off balancing if we have no groups */ + sd->flags = 0; + } + } + if (!sg) { + printk(KERN_WARNING + "Can not alloc domain group for node %d\n", i); + continue; + } + sg->cpu_power = 0; + sg->cpumask = nodemask; + cpus_or(covered, covered, nodemask); + prev = sg; + + for (j = 0; j < MAX_NUMNODES; j++) { + cpumask_t tmp, notcovered; + int n = (i + j) % MAX_NUMNODES; + + cpus_complement(notcovered, covered); + cpus_and(tmp, notcovered, *cpu_map); + cpus_and(tmp, tmp, domainspan); + if (cpus_empty(tmp)) + break; + + nodemask = node_to_cpumask(n); + cpus_and(tmp, tmp, nodemask); + if (cpus_empty(tmp)) + continue; + + sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); + if (!sg) { + printk(KERN_WARNING + "Can not alloc domain group for node %d\n", j); + break; + } + sg->cpu_power = 0; + sg->cpumask = tmp; + cpus_or(covered, covered, tmp); + prev->next = sg; + prev = sg; + } + prev->next = sched_group_nodes[i]; + } #endif /* Calculate CPU power for physical packages and nodes */ @@ -5008,14 +5290,46 @@ static void build_sched_domains(const cpumask_t *cpu_map) sd->groups->cpu_power = power; #ifdef CONFIG_NUMA - if (i == first_cpu(sd->groups->cpumask)) { - /* Only add "power" once for each physical package. 
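
In the cpu_power calculations that follow, a group's power is one full SCHED_LOAD_SCALE for its first CPU plus a tenth of SCHED_LOAD_SCALE per additional CPU, so SMT siblings only add a modest amount of extra capacity to their package. A stand-alone sketch of that arithmetic, with SCHED_LOAD_SCALE assumed to be 128 for illustration:

#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL         /* assumed value for illustration */

/*
 * Group power as computed in build_sched_domains(): full weight for the
 * first CPU in the group, one tenth of SCHED_LOAD_SCALE for each extra CPU.
 */
static unsigned long group_power(unsigned int nr_cpus)
{
	return SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * (nr_cpus - 1) / 10;
}

int main(void)
{
	unsigned int n;

	for (n = 1; n <= 8; n *= 2)
		printf("%u CPUs in group -> cpu_power %lu\n", n, group_power(n));
	return 0;
}
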
*/ - sd = &per_cpu(node_domains, i); - sd->groups->cpu_power += power; + sd = &per_cpu(allnodes_domains, i); + if (sd->groups) { + power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * + (cpus_weight(sd->groups->cpumask)-1) / 10; + sd->groups->cpu_power = power; } #endif } +#ifdef CONFIG_NUMA + for (i = 0; i < MAX_NUMNODES; i++) { + struct sched_group *sg = sched_group_nodes[i]; + int j; + + if (sg == NULL) + continue; +next_sg: + for_each_cpu_mask(j, sg->cpumask) { + struct sched_domain *sd; + int power; + + sd = &per_cpu(phys_domains, j); + if (j != first_cpu(sd->groups->cpumask)) { + /* + * Only add "power" once for each + * physical package. + */ + continue; + } + power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * + (cpus_weight(sd->groups->cpumask)-1) / 10; + + sg->cpu_power += power; + } + sg = sg->next; + if (sg != sched_group_nodes[i]) + goto next_sg; + } +#endif + /* Attach the domains */ for_each_cpu_mask(i, *cpu_map) { struct sched_domain *sd; @@ -5030,13 +5344,10 @@ static void build_sched_domains(const cpumask_t *cpu_map) /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. */ -static void arch_init_sched_domains(cpumask_t *cpu_map) +static void arch_init_sched_domains(const cpumask_t *cpu_map) { cpumask_t cpu_default_map; -#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) - check_sibling_maps(); -#endif /* * Setup mask for cpus without special case scheduling requirements. * For now this just excludes isolated cpus, but could be used to @@ -5049,10 +5360,47 @@ static void arch_init_sched_domains(cpumask_t *cpu_map) static void arch_destroy_sched_domains(const cpumask_t *cpu_map) { - /* Do nothing: everything is statically allocated. */ -} +#ifdef CONFIG_NUMA + int i; + int cpu; -#endif /* ARCH_HAS_SCHED_DOMAIN */ + for_each_cpu_mask(cpu, *cpu_map) { + struct sched_group *sched_group_allnodes + = sched_group_allnodes_bycpu[cpu]; + struct sched_group **sched_group_nodes + = sched_group_nodes_bycpu[cpu]; + + if (sched_group_allnodes) { + kfree(sched_group_allnodes); + sched_group_allnodes_bycpu[cpu] = NULL; + } + + if (!sched_group_nodes) + continue; + + for (i = 0; i < MAX_NUMNODES; i++) { + cpumask_t nodemask = node_to_cpumask(i); + struct sched_group *oldsg, *sg = sched_group_nodes[i]; + + cpus_and(nodemask, nodemask, *cpu_map); + if (cpus_empty(nodemask)) + continue; + + if (sg == NULL) + continue; + sg = sg->next; +next_sg: + oldsg = sg; + sg = sg->next; + kfree(oldsg); + if (oldsg != sched_group_nodes[i]) + goto next_sg; + } + kfree(sched_group_nodes); + sched_group_nodes_bycpu[cpu] = NULL; + } +#endif +} /* * Detach sched domains from a group of cpus specified in cpu_map @@ -5254,3 +5602,47 @@ void normalize_rt_tasks(void) } #endif /* CONFIG_MAGIC_SYSRQ */ + +#ifdef CONFIG_IA64 +/* + * These functions are only useful for the IA64 MCA handling. + * + * They can only be called when the whole system has been + * stopped - every CPU needs to be quiescent, and no scheduling + * activity can take place. Using them for anything else would + * be a serious bug, and as a result, they aren't even visible + * under any other configuration. + */ + +/** + * curr_task - return the current task for a given cpu. + * @cpu: the processor in question. + * + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! + */ +task_t *curr_task(int cpu) +{ + return cpu_curr(cpu); +} + +/** + * set_curr_task - set the current task for a given cpu. + * @cpu: the processor in question. + * @p: the task pointer to set. 
+ * + * Description: This function must only be used when non-maskable interrupts + * are serviced on a separate stack. It allows the architecture to switch the + * notion of the current task on a cpu in a non-blocking manner. This function + * must be called with all CPU's synchronized, and interrupts disabled, the + * and caller must save the original value of the current task (see + * curr_task() above) and restore that value before reenabling interrupts and + * re-starting the system. + * + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! + */ +void set_curr_task(int cpu, task_t *p) +{ + cpu_curr(cpu) = p; +} + +#endif diff --git a/kernel/signal.c b/kernel/signal.c index ca1186eef938..b92c3c9f8b9a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -678,7 +678,7 @@ static int check_kill_permission(int sig, struct siginfo *info, /* forward decl */ static void do_notify_parent_cldstop(struct task_struct *tsk, - struct task_struct *parent, + int to_self, int why); /* @@ -692,7 +692,7 @@ static void handle_stop_signal(int sig, struct task_struct *p) { struct task_struct *t; - if (p->flags & SIGNAL_GROUP_EXIT) + if (p->signal->flags & SIGNAL_GROUP_EXIT) /* * The process is in the middle of dying already. */ @@ -729,14 +729,7 @@ static void handle_stop_signal(int sig, struct task_struct *p) p->signal->group_stop_count = 0; p->signal->flags = SIGNAL_STOP_CONTINUED; spin_unlock(&p->sighand->siglock); - if (p->ptrace & PT_PTRACED) - do_notify_parent_cldstop(p, p->parent, - CLD_STOPPED); - else - do_notify_parent_cldstop( - p->group_leader, - p->group_leader->real_parent, - CLD_STOPPED); + do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_STOPPED); spin_lock(&p->sighand->siglock); } rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); @@ -777,14 +770,7 @@ static void handle_stop_signal(int sig, struct task_struct *p) p->signal->flags = SIGNAL_STOP_CONTINUED; p->signal->group_exit_code = 0; spin_unlock(&p->sighand->siglock); - if (p->ptrace & PT_PTRACED) - do_notify_parent_cldstop(p, p->parent, - CLD_CONTINUED); - else - do_notify_parent_cldstop( - p->group_leader, - p->group_leader->real_parent, - CLD_CONTINUED); + do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_CONTINUED); spin_lock(&p->sighand->siglock); } else { /* @@ -1380,16 +1366,16 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) unsigned long flags; int ret = 0; - /* - * We need the tasklist lock even for the specific - * thread case (when we don't need to follow the group - * lists) in order to avoid races with "p->sighand" - * going away or changing from under us. - */ BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); - read_lock(&tasklist_lock); + read_lock(&tasklist_lock); + + if (unlikely(p->flags & PF_EXITING)) { + ret = -1; + goto out_err; + } + spin_lock_irqsave(&p->sighand->siglock, flags); - + if (unlikely(!list_empty(&q->list))) { /* * If an SI_TIMER entry is already queue just increment @@ -1399,7 +1385,7 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) BUG(); q->info.si_overrun++; goto out; - } + } /* Short-circuit ignored signals. 
*/ if (sig_ignored(p, sig)) { ret = 1; @@ -1414,8 +1400,10 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) out: spin_unlock_irqrestore(&p->sighand->siglock, flags); +out_err: read_unlock(&tasklist_lock); - return(ret); + + return ret; } int @@ -1542,14 +1530,20 @@ void do_notify_parent(struct task_struct *tsk, int sig) spin_unlock_irqrestore(&psig->siglock, flags); } -static void -do_notify_parent_cldstop(struct task_struct *tsk, struct task_struct *parent, - int why) +static void do_notify_parent_cldstop(struct task_struct *tsk, int to_self, int why) { struct siginfo info; unsigned long flags; + struct task_struct *parent; struct sighand_struct *sighand; + if (to_self) + parent = tsk->parent; + else { + tsk = tsk->group_leader; + parent = tsk->real_parent; + } + info.si_signo = SIGCHLD; info.si_errno = 0; info.si_pid = tsk->pid; @@ -1618,8 +1612,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info) !(current->ptrace & PT_ATTACHED)) && (likely(current->parent->signal != current->signal) || !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) { - do_notify_parent_cldstop(current, current->parent, - CLD_TRAPPED); + do_notify_parent_cldstop(current, 1, CLD_TRAPPED); read_unlock(&tasklist_lock); schedule(); } else { @@ -1668,25 +1661,25 @@ void ptrace_notify(int exit_code) static void finish_stop(int stop_count) { + int to_self; + /* * If there are no other threads in the group, or if there is * a group stop in progress and we are the last to stop, * report to the parent. When ptraced, every thread reports itself. */ - if (stop_count < 0 || (current->ptrace & PT_PTRACED)) { - read_lock(&tasklist_lock); - do_notify_parent_cldstop(current, current->parent, - CLD_STOPPED); - read_unlock(&tasklist_lock); - } - else if (stop_count == 0) { - read_lock(&tasklist_lock); - do_notify_parent_cldstop(current->group_leader, - current->group_leader->real_parent, - CLD_STOPPED); - read_unlock(&tasklist_lock); - } + if (stop_count < 0 || (current->ptrace & PT_PTRACED)) + to_self = 1; + else if (stop_count == 0) + to_self = 0; + else + goto out; + read_lock(&tasklist_lock); + do_notify_parent_cldstop(current, to_self, CLD_STOPPED); + read_unlock(&tasklist_lock); + +out: schedule(); /* * Now we don't run again until continued. @@ -2228,8 +2221,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese, recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - current->state = TASK_INTERRUPTIBLE; - timeout = schedule_timeout(timeout); + timeout = schedule_timeout_interruptible(timeout); try_to_freeze(); spin_lock_irq(¤t->sighand->siglock); diff --git a/kernel/softirq.c b/kernel/softirq.c index b4ab6af1dea8..f766b2fc48be 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -84,7 +84,7 @@ asmlinkage void __do_softirq(void) cpu = smp_processor_id(); restart: /* Reset the pending bitmask before enabling irqs */ - local_softirq_pending() = 0; + set_softirq_pending(0); local_irq_enable(); diff --git a/kernel/softlockup.c b/kernel/softlockup.c new file mode 100644 index 000000000000..75976209cea7 --- /dev/null +++ b/kernel/softlockup.c @@ -0,0 +1,151 @@ +/* + * Detect Soft Lockups + * + * started by Ingo Molnar, (C) 2005, Red Hat + * + * this code detects soft lockups: incidents in where on a CPU + * the kernel does not reschedule for 10 seconds or more. 
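
The new detector described above boils down to two per-CPU timestamps compared from the timer interrupt: a per-CPU watchdog thread refreshes one of them roughly once a second, and softlockup_tick() warns when it has not moved for more than 10 seconds. Below is a stand-alone, single-CPU model of that check (not the kernel code), using plain seconds instead of jiffies.

#include <stdio.h>
#include <time.h>

static time_t timestamp;               /* last time the watchdog thread ran */
static time_t print_timestamp;         /* stall we have already reported    */

/* Role of touch_softlockup_watchdog(): note that we got to run just now. */
static void touch_watchdog(void)
{
	timestamp = time(NULL);
}

/*
 * Role of softlockup_tick(), called from the timer interrupt: report a
 * stall once if the watchdog has not refreshed its timestamp for 10s.
 */
static void tick(void)
{
	time_t now = time(NULL);

	if (print_timestamp == timestamp)
		return;                /* this stall was already reported */

	if (now > timestamp + 10) {
		print_timestamp = timestamp;
		printf("BUG: soft lockup detected (stalled for %lds)\n",
		       (long)(now - timestamp));
	}
}

int main(void)
{
	touch_watchdog();
	tick();                        /* fresh timestamp: stays quiet       */
	timestamp -= 15;               /* simulate a thread stalled 15s ago  */
	tick();                        /* reports the lockup once            */
	tick();                        /* quiet again: same stalled value    */
	return 0;
}
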
+ */ + +#include <linux/mm.h> +#include <linux/cpu.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/kthread.h> +#include <linux/notifier.h> +#include <linux/module.h> + +static DEFINE_SPINLOCK(print_lock); + +static DEFINE_PER_CPU(unsigned long, timestamp) = 0; +static DEFINE_PER_CPU(unsigned long, print_timestamp) = 0; +static DEFINE_PER_CPU(struct task_struct *, watchdog_task); + +static int did_panic = 0; +static int softlock_panic(struct notifier_block *this, unsigned long event, + void *ptr) +{ + did_panic = 1; + + return NOTIFY_DONE; +} + +static struct notifier_block panic_block = { + .notifier_call = softlock_panic, +}; + +void touch_softlockup_watchdog(void) +{ + per_cpu(timestamp, raw_smp_processor_id()) = jiffies; +} +EXPORT_SYMBOL(touch_softlockup_watchdog); + +/* + * This callback runs from the timer interrupt, and checks + * whether the watchdog thread has hung or not: + */ +void softlockup_tick(struct pt_regs *regs) +{ + int this_cpu = smp_processor_id(); + unsigned long timestamp = per_cpu(timestamp, this_cpu); + + if (per_cpu(print_timestamp, this_cpu) == timestamp) + return; + + /* Do not cause a second panic when there already was one */ + if (did_panic) + return; + + if (time_after(jiffies, timestamp + 10*HZ)) { + per_cpu(print_timestamp, this_cpu) = timestamp; + + spin_lock(&print_lock); + printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n", + this_cpu); + show_regs(regs); + spin_unlock(&print_lock); + } +} + +/* + * The watchdog thread - runs every second and touches the timestamp. + */ +static int watchdog(void * __bind_cpu) +{ + struct sched_param param = { .sched_priority = 99 }; + int this_cpu = (long) __bind_cpu; + + printk("softlockup thread %d started up.\n", this_cpu); + + sched_setscheduler(current, SCHED_FIFO, ¶m); + current->flags |= PF_NOFREEZE; + + set_current_state(TASK_INTERRUPTIBLE); + + /* + * Run briefly once per second - if this gets delayed for + * more than 10 seconds then the debug-printout triggers + * in softlockup_tick(): + */ + while (!kthread_should_stop()) { + msleep_interruptible(1000); + touch_softlockup_watchdog(); + } + __set_current_state(TASK_RUNNING); + + return 0; +} + +/* + * Create/destroy watchdog threads as CPUs come and go: + */ +static int __devinit +cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + int hotcpu = (unsigned long)hcpu; + struct task_struct *p; + + switch (action) { + case CPU_UP_PREPARE: + BUG_ON(per_cpu(watchdog_task, hotcpu)); + p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu); + if (IS_ERR(p)) { + printk("watchdog for %i failed\n", hotcpu); + return NOTIFY_BAD; + } + per_cpu(watchdog_task, hotcpu) = p; + kthread_bind(p, hotcpu); + break; + case CPU_ONLINE: + + wake_up_process(per_cpu(watchdog_task, hotcpu)); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + /* Unbind so it can run. Fall thru. 
*/ + kthread_bind(per_cpu(watchdog_task, hotcpu), smp_processor_id()); + case CPU_DEAD: + p = per_cpu(watchdog_task, hotcpu); + per_cpu(watchdog_task, hotcpu) = NULL; + kthread_stop(p); + break; +#endif /* CONFIG_HOTPLUG_CPU */ + } + return NOTIFY_OK; +} + +static struct notifier_block __devinitdata cpu_nfb = { + .notifier_call = cpu_callback +}; + +__init void spawn_softlockup_task(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + + cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); + register_cpu_notifier(&cpu_nfb); + + notifier_chain_register(&panic_notifier_list, &panic_block); +} + diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 0c3f9d8bbe17..0375fcd5921d 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -3,7 +3,10 @@ * * Author: Zwane Mwaikambo <zwane@fsmlabs.com> * - * Copyright (2004) Ingo Molnar + * Copyright (2004, 2005) Ingo Molnar + * + * This file contains the spinlock/rwlock implementations for the + * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them) */ #include <linux/config.h> @@ -17,12 +20,12 @@ * Generic declaration of the raw read_trylock() function, * architectures are supposed to optimize this: */ -int __lockfunc generic_raw_read_trylock(rwlock_t *lock) +int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock) { - _raw_read_lock(lock); + __raw_read_lock(lock); return 1; } -EXPORT_SYMBOL(generic_raw_read_trylock); +EXPORT_SYMBOL(generic__raw_read_trylock); int __lockfunc _spin_trylock(spinlock_t *lock) { @@ -57,7 +60,7 @@ int __lockfunc _write_trylock(rwlock_t *lock) } EXPORT_SYMBOL(_write_trylock); -#ifndef CONFIG_PREEMPT +#if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) void __lockfunc _read_lock(rwlock_t *lock) { @@ -72,7 +75,7 @@ unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) local_irq_save(flags); preempt_disable(); - _raw_spin_lock_flags(lock, flags); + _raw_spin_lock_flags(lock, &flags); return flags; } EXPORT_SYMBOL(_spin_lock_irqsave); diff --git a/kernel/sys.c b/kernel/sys.c index 9a24374c23bc..c80412be2302 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -361,6 +361,64 @@ out_unlock: return retval; } +void emergency_restart(void) +{ + machine_emergency_restart(); +} +EXPORT_SYMBOL_GPL(emergency_restart); + +void kernel_restart(char *cmd) +{ + notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); + system_state = SYSTEM_RESTART; + device_shutdown(); + if (!cmd) { + printk(KERN_EMERG "Restarting system.\n"); + } else { + printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); + } + printk(".\n"); + machine_restart(cmd); +} +EXPORT_SYMBOL_GPL(kernel_restart); + +void kernel_kexec(void) +{ +#ifdef CONFIG_KEXEC + struct kimage *image; + image = xchg(&kexec_image, 0); + if (!image) { + return; + } + notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); + system_state = SYSTEM_RESTART; + device_shutdown(); + printk(KERN_EMERG "Starting new kernel\n"); + machine_shutdown(); + machine_kexec(image); +#endif +} +EXPORT_SYMBOL_GPL(kernel_kexec); + +void kernel_halt(void) +{ + notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); + system_state = SYSTEM_HALT; + device_shutdown(); + printk(KERN_EMERG "System halted.\n"); + machine_halt(); +} +EXPORT_SYMBOL_GPL(kernel_halt); + +void kernel_power_off(void) +{ + notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); + system_state = SYSTEM_POWER_OFF; + device_shutdown(); + printk(KERN_EMERG "Power down.\n"); + machine_power_off(); +} +EXPORT_SYMBOL_GPL(kernel_power_off); /* * Reboot 
system call: for obvious reasons only root may call it, @@ -389,11 +447,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user lock_kernel(); switch (cmd) { case LINUX_REBOOT_CMD_RESTART: - notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); - system_state = SYSTEM_RESTART; - device_shutdown(); - printk(KERN_EMERG "Restarting system.\n"); - machine_restart(NULL); + kernel_restart(NULL); break; case LINUX_REBOOT_CMD_CAD_ON: @@ -405,23 +459,13 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user break; case LINUX_REBOOT_CMD_HALT: - notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); - system_state = SYSTEM_HALT; - device_suspend(PMSG_SUSPEND); - device_shutdown(); - printk(KERN_EMERG "System halted.\n"); - machine_halt(); + kernel_halt(); unlock_kernel(); do_exit(0); break; case LINUX_REBOOT_CMD_POWER_OFF: - notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); - system_state = SYSTEM_POWER_OFF; - device_suspend(PMSG_SUSPEND); - device_shutdown(); - printk(KERN_EMERG "Power down.\n"); - machine_power_off(); + kernel_power_off(); unlock_kernel(); do_exit(0); break; @@ -433,32 +477,14 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user } buffer[sizeof(buffer) - 1] = '\0'; - notifier_call_chain(&reboot_notifier_list, SYS_RESTART, buffer); - system_state = SYSTEM_RESTART; - device_suspend(PMSG_FREEZE); - device_shutdown(); - printk(KERN_EMERG "Restarting system with command '%s'.\n", buffer); - machine_restart(buffer); + kernel_restart(buffer); break; -#ifdef CONFIG_KEXEC case LINUX_REBOOT_CMD_KEXEC: - { - struct kimage *image; - image = xchg(&kexec_image, 0); - if (!image) { - unlock_kernel(); - return -EINVAL; - } - notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); - system_state = SYSTEM_RESTART; - device_shutdown(); - printk(KERN_EMERG "Starting new kernel\n"); - machine_shutdown(); - machine_kexec(image); - break; - } -#endif + kernel_kexec(); + unlock_kernel(); + return -EINVAL; + #ifdef CONFIG_SOFTWARE_SUSPEND case LINUX_REBOOT_CMD_SW_SUSPEND: { @@ -478,8 +504,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user static void deferred_cad(void *dummy) { - notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); - machine_restart(NULL); + kernel_restart(NULL); } /* @@ -1686,7 +1711,6 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { long error; - int sig; error = security_task_prctl(option, arg2, arg3, arg4, arg5); if (error) @@ -1694,12 +1718,11 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, switch (option) { case PR_SET_PDEATHSIG: - sig = arg2; - if (!valid_signal(sig)) { + if (!valid_signal(arg2)) { error = -EINVAL; break; } - current->pdeath_signal = sig; + current->pdeath_signal = arg2; break; case PR_GET_PDEATHSIG: error = put_user(current->pdeath_signal, (int __user *)arg2); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 29196ce9b40f..1ab2370e2efa 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -79,7 +79,9 @@ cond_syscall(sys_request_key); cond_syscall(sys_keyctl); cond_syscall(compat_sys_keyctl); cond_syscall(compat_sys_socketcall); -cond_syscall(sys_set_zone_reclaim); +cond_syscall(sys_inotify_init); +cond_syscall(sys_inotify_add_watch); +cond_syscall(sys_inotify_rm_watch); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); diff --git a/kernel/sysctl.c b/kernel/sysctl.c 
index 270ee7fadbd8..8e56e2495542 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -31,6 +31,7 @@ #include <linux/smp_lock.h> #include <linux/init.h> #include <linux/kernel.h> +#include <linux/net.h> #include <linux/sysrq.h> #include <linux/highuid.h> #include <linux/writeback.h> @@ -114,6 +115,7 @@ extern int unaligned_enabled; extern int sysctl_ieee_emulation_warnings; #endif extern int sysctl_userprocess_debug; +extern int spin_retry; #endif extern int sysctl_hz_timer; @@ -135,9 +137,6 @@ static struct ctl_table_header root_table_header = static ctl_table kern_table[]; static ctl_table vm_table[]; -#ifdef CONFIG_NET -extern ctl_table net_table[]; -#endif static ctl_table proc_table[]; static ctl_table fs_table[]; static ctl_table debug_table[]; @@ -146,6 +145,9 @@ extern ctl_table random_table[]; #ifdef CONFIG_UNIX98_PTYS extern ctl_table pty_table[]; #endif +#ifdef CONFIG_INOTIFY +extern ctl_table inotify_table[]; +#endif #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT int sysctl_legacy_va_layout; @@ -218,6 +220,7 @@ static ctl_table root_table[] = { .mode = 0555, .child = dev_table, }, + { .ctl_name = 0 } }; @@ -643,7 +646,16 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, - +#if defined(CONFIG_ARCH_S390) + { + .ctl_name = KERN_SPIN_RETRY, + .procname = "spin_retry", + .data = &spin_retry, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = 0 } }; @@ -950,6 +962,14 @@ static ctl_table fs_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#ifdef CONFIG_INOTIFY + { + .ctl_name = FS_INOTIFY, + .procname = "inotify", + .mode = 0555, + .child = inotify_table, + }, +#endif #endif { .ctl_name = KERN_SETUID_DUMPABLE, @@ -968,7 +988,7 @@ static ctl_table debug_table[] = { static ctl_table dev_table[] = { { .ctl_name = 0 } -}; +}; extern void init_irq_proc (void); diff --git a/kernel/time.c b/kernel/time.c index d4335c1c884c..dd5ae1162a8f 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -128,7 +128,7 @@ asmlinkage long sys_gettimeofday(struct timeval __user *tv, struct timezone __us * as real UNIX machines always do it. This avoids all headaches about * daylight saving times and warping kernel clocks. */ -inline static void warp_clock(void) +static inline void warp_clock(void) { write_seqlock_irq(&xtime_lock); wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; diff --git a/kernel/timer.c b/kernel/timer.c index f2a11887a726..3ba10fa35b60 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -950,6 +950,7 @@ void do_timer(struct pt_regs *regs) { jiffies_64++; update_times(); + softlockup_tick(regs); } #ifdef __ARCH_WANT_SYS_ALARM @@ -1023,7 +1024,7 @@ asmlinkage long sys_getppid(void) parent = me->group_leader->real_parent; for (;;) { pid = parent->tgid; -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) { struct task_struct *old = parent; @@ -1150,9 +1151,26 @@ fastcall signed long __sched schedule_timeout(signed long timeout) out: return timeout < 0 ? 0 : timeout; } - EXPORT_SYMBOL(schedule_timeout); +/* + * We can use __set_current_state() here because schedule_timeout() calls + * schedule() unconditionally. 
+ */ +signed long __sched schedule_timeout_interruptible(signed long timeout) +{ + __set_current_state(TASK_INTERRUPTIBLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_interruptible); + +signed long __sched schedule_timeout_uninterruptible(signed long timeout) +{ + __set_current_state(TASK_UNINTERRUPTIBLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_uninterruptible); + /* Thread ID - the internal kernel "pid" */ asmlinkage long sys_gettid(void) { @@ -1169,8 +1187,7 @@ static long __sched nanosleep_restart(struct restart_block *restart) if (!time_after(expire, now)) return 0; - current->state = TASK_INTERRUPTIBLE; - expire = schedule_timeout(expire - now); + expire = schedule_timeout_interruptible(expire - now); ret = 0; if (expire) { @@ -1198,8 +1215,7 @@ asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __us return -EINVAL; expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); - current->state = TASK_INTERRUPTIBLE; - expire = schedule_timeout(expire); + expire = schedule_timeout_interruptible(expire); ret = 0; if (expire) { @@ -1428,7 +1444,7 @@ static inline u64 time_interpolator_get_cycles(unsigned int src) } } -static inline u64 time_interpolator_get_counter(void) +static inline u64 time_interpolator_get_counter(int writelock) { unsigned int src = time_interpolator->source; @@ -1442,6 +1458,15 @@ static inline u64 time_interpolator_get_counter(void) now = time_interpolator_get_cycles(src); if (lcycle && time_after(lcycle, now)) return lcycle; + + /* When holding the xtime write lock, there's no need + * to add the overhead of the cmpxchg. Readers are + * force to retry until the write lock is released. + */ + if (writelock) { + time_interpolator->last_cycle = now; + return now; + } /* Keep track of the last timer value returned. The use of cmpxchg here * will cause contention in an SMP environment. */ @@ -1455,7 +1480,7 @@ static inline u64 time_interpolator_get_counter(void) void time_interpolator_reset(void) { time_interpolator->offset = 0; - time_interpolator->last_counter = time_interpolator_get_counter(); + time_interpolator->last_counter = time_interpolator_get_counter(1); } #define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift) @@ -1467,7 +1492,7 @@ unsigned long time_interpolator_get_offset(void) return 0; return time_interpolator->offset + - GET_TI_NSECS(time_interpolator_get_counter(), time_interpolator); + GET_TI_NSECS(time_interpolator_get_counter(0), time_interpolator); } #define INTERPOLATOR_ADJUST 65536 @@ -1490,7 +1515,7 @@ static void time_interpolator_update(long delta_nsec) * and the tuning logic insures that. 
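
The comment above about skipping the cmpxchg is easier to see in a small model: readers publish the latest counter value with a compare-and-swap so the returned value never moves backwards, while the write-lock path can use a plain store because readers retry until the write lock is released. A user-space sketch of that idea, not the kernel code, using GCC's __atomic builtins and an artificial counter value passed in by the caller:

#include <stdio.h>

static unsigned long last_cycle;       /* highest counter value published */

/*
 * Model of time_interpolator_get_counter(): never return a value older
 * than one already handed out.  Readers publish 'now' with a CAS; the
 * writelock path can use a plain store because readers retry until the
 * write lock is released.
 */
static unsigned long get_counter(unsigned long now, int writelock)
{
	unsigned long last = __atomic_load_n(&last_cycle, __ATOMIC_RELAXED);

	if (last > now)                /* a later value was already returned */
		return last;

	if (writelock) {
		__atomic_store_n(&last_cycle, now, __ATOMIC_RELAXED);
		return now;
	}

	if (!__atomic_compare_exchange_n(&last_cycle, &last, now, 0,
					 __ATOMIC_RELAXED, __ATOMIC_RELAXED))
		return last;           /* lost the race; 'last' now holds the winner */
	return now;
}

int main(void)
{
	printf("%lu\n", get_counter(100, 0));  /* advances to 100        */
	printf("%lu\n", get_counter(90, 0));   /* clamped, still 100     */
	printf("%lu\n", get_counter(150, 1));  /* write-lock plain store */
	return 0;
}
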
*/ - counter = time_interpolator_get_counter(); + counter = time_interpolator_get_counter(1); offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator); if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) @@ -1588,10 +1613,8 @@ void msleep(unsigned int msecs) { unsigned long timeout = msecs_to_jiffies(msecs) + 1; - while (timeout) { - set_current_state(TASK_UNINTERRUPTIBLE); - timeout = schedule_timeout(timeout); - } + while (timeout) + timeout = schedule_timeout_uninterruptible(timeout); } EXPORT_SYMBOL(msleep); @@ -1604,10 +1627,8 @@ unsigned long msleep_interruptible(unsigned int msecs) { unsigned long timeout = msecs_to_jiffies(msecs) + 1; - while (timeout && !signal_pending(current)) { - set_current_state(TASK_INTERRUPTIBLE); - timeout = schedule_timeout(timeout); - } + while (timeout && !signal_pending(current)) + timeout = schedule_timeout_interruptible(timeout); return jiffies_to_msecs(timeout); } diff --git a/kernel/user.c b/kernel/user.c index 734575d55769..89e562feb1b1 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -120,6 +120,10 @@ struct user_struct * alloc_uid(uid_t uid) atomic_set(&new->processes, 0); atomic_set(&new->files, 0); atomic_set(&new->sigpending, 0); +#ifdef CONFIG_INOTIFY + atomic_set(&new->inotify_watches, 0); + atomic_set(&new->inotify_devs, 0); +#endif new->mq_bytes = 0; new->locked_shm = 0; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 259cf55da3c9..91bacb13a7e2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -308,12 +308,9 @@ struct workqueue_struct *__create_workqueue(const char *name, struct workqueue_struct *wq; struct task_struct *p; - BUG_ON(strlen(name) > 10); - - wq = kmalloc(sizeof(*wq), GFP_KERNEL); + wq = kzalloc(sizeof(*wq), GFP_KERNEL); if (!wq) return NULL; - memset(wq, 0, sizeof(*wq)); wq->name = name; /* We don't need the distraction of CPUs appearing and vanishing. */ @@ -501,7 +498,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, case CPU_UP_PREPARE: /* Create a new workqueue thread for it. */ list_for_each_entry(wq, &workqueues, list) { - if (create_workqueue_thread(wq, hotcpu) < 0) { + if (!create_workqueue_thread(wq, hotcpu)) { printk("workqueue for %i failed\n", hotcpu); return NOTIFY_BAD; } |
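
The msleep()/msleep_interruptible() changes just above keep the old retry loop and only fold the state setting into the new schedule_timeout_*() helpers. For comparison, here is a user-space loop with the same shape (illustrative only), using nanosleep()'s remaining-time output the way the kernel code uses schedule_timeout()'s return value.

#define _POSIX_C_SOURCE 199309L
#include <stdio.h>
#include <time.h>
#include <errno.h>

/*
 * Sleep for roughly 'msecs' milliseconds, restarting after signal
 * interruptions: the same loop shape as msleep(), where the unexpired
 * remainder is fed back in until it reaches zero.
 */
static void msleep_like(unsigned int msecs)
{
	struct timespec req = {
		.tv_sec  = msecs / 1000,
		.tv_nsec = (long)(msecs % 1000) * 1000000L,
	};

	while (nanosleep(&req, &req) == -1 && errno == EINTR)
		;                      /* keep sleeping off the remainder */
}

int main(void)
{
	puts("sleeping for about 200ms");
	msleep_like(200);
	puts("done");
	return 0;
}
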
