summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/acct.c45
-rw-r--r--kernel/audit.c131
-rw-r--r--kernel/auditsc.c327
-rw-r--r--kernel/capability.c20
-rw-r--r--kernel/compat.c9
-rw-r--r--kernel/cpuset.c297
-rw-r--r--kernel/crash_dump.c11
-rw-r--r--kernel/exit.c30
-rw-r--r--kernel/fork.c108
-rw-r--r--kernel/futex.c137
-rw-r--r--kernel/intermodule.c3
-rw-r--r--kernel/irq/handle.c2
-rw-r--r--kernel/irq/manage.c4
-rw-r--r--kernel/irq/proc.c14
-rw-r--r--kernel/itimer.c37
-rw-r--r--kernel/kprobes.c94
-rw-r--r--kernel/module.c59
-rw-r--r--kernel/panic.c9
-rw-r--r--kernel/params.c4
-rw-r--r--kernel/posix-timers.c46
-rw-r--r--kernel/power/Kconfig15
-rw-r--r--kernel/power/disk.c74
-rw-r--r--kernel/power/main.c21
-rw-r--r--kernel/power/pm.c3
-rw-r--r--kernel/power/poweroff.c4
-rw-r--r--kernel/power/process.c35
-rw-r--r--kernel/power/smp.c2
-rw-r--r--kernel/power/swsusp.c230
-rw-r--r--kernel/printk.c13
-rw-r--r--kernel/profile.c4
-rw-r--r--kernel/ptrace.c41
-rw-r--r--kernel/rcupdate.c14
-rw-r--r--kernel/resource.c3
-rw-r--r--kernel/sched.c634
-rw-r--r--kernel/signal.c88
-rw-r--r--kernel/softirq.c2
-rw-r--r--kernel/softlockup.c151
-rw-r--r--kernel/spinlock.c15
-rw-r--r--kernel/sys.c115
-rw-r--r--kernel/sys_ni.c4
-rw-r--r--kernel/sysctl.c30
-rw-r--r--kernel/time.c2
-rw-r--r--kernel/timer.c57
-rw-r--r--kernel/user.c4
-rw-r--r--kernel/workqueue.c7
46 files changed, 2175 insertions, 782 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index cb05cd05d237..ff4dc02ce170 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -12,6 +12,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
obj-$(CONFIG_FUTEX) += futex.o
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
obj-$(CONFIG_SMP) += cpu.o spinlock.o
+obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
@@ -27,6 +28,7 @@ obj-$(CONFIG_AUDIT) += audit.o
obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_SYSFS) += ksysfs.o
+obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_SECCOMP) += seccomp.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 4168f631868e..b756f527497e 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -165,7 +165,7 @@ out:
}
/*
- * Close the old accouting file (if currently open) and then replace
+ * Close the old accounting file (if currently open) and then replace
* it with file (if non-NULL).
*
* NOTE: acct_globals.lock MUST be held on entry and exit.
@@ -199,11 +199,16 @@ static void acct_file_reopen(struct file *file)
}
}
-/*
- * sys_acct() is the only system call needed to implement process
- * accounting. It takes the name of the file where accounting records
- * should be written. If the filename is NULL, accounting will be
- * shutdown.
+/**
+ * sys_acct - enable/disable process accounting
+ * @name: file name for accounting records or NULL to shutdown accounting
+ *
+ * Returns 0 for success or negative errno values for failure.
+ *
+ * sys_acct() is the only system call needed to implement process
+ * accounting. It takes the name of the file where accounting records
+ * should be written. If the filename is NULL, accounting will be
+ * shutdown.
*/
asmlinkage long sys_acct(const char __user *name)
{
@@ -220,7 +225,7 @@ asmlinkage long sys_acct(const char __user *name)
return (PTR_ERR(tmp));
}
/* Difference from BSD - they don't do O_APPEND */
- file = filp_open(tmp, O_WRONLY|O_APPEND, 0);
+ file = filp_open(tmp, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
putname(tmp);
if (IS_ERR(file)) {
return (PTR_ERR(file));
@@ -250,9 +255,12 @@ asmlinkage long sys_acct(const char __user *name)
return (0);
}
-/*
- * If the accouting is turned on for a file in the filesystem pointed
- * to by sb, turn accouting off.
+/**
+ * acct_auto_close - turn off a filesystem's accounting if it is on
+ * @sb: super block for the filesystem
+ *
+ * If the accounting is turned on for a file in the filesystem pointed
+ * to by sb, turn accounting off.
*/
void acct_auto_close(struct super_block *sb)
{
@@ -503,8 +511,11 @@ static void do_acct_process(long exitcode, struct file *file)
set_fs(fs);
}
-/*
+/**
* acct_process - now just a wrapper around do_acct_process
+ * @exitcode: task exit code
+ *
+ * handles process accounting for an exiting task
*/
void acct_process(long exitcode)
{
@@ -530,9 +541,9 @@ void acct_process(long exitcode)
}
-/*
- * acct_update_integrals
- * - update mm integral fields in task_struct
+/**
+ * acct_update_integrals - update mm integral fields in task_struct
+ * @tsk: task_struct for accounting
*/
void acct_update_integrals(struct task_struct *tsk)
{
@@ -547,9 +558,9 @@ void acct_update_integrals(struct task_struct *tsk)
}
}
-/*
- * acct_clear_integrals
- * - clear the mm integral fields in task_struct
+/**
+ * acct_clear_integrals - clear the mm integral fields in task_struct
+ * @tsk: task_struct whose accounting fields are cleared
*/
void acct_clear_integrals(struct task_struct *tsk)
{
diff --git a/kernel/audit.c b/kernel/audit.c
index ef35166fdc29..83096b67510a 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -79,6 +79,8 @@ static int audit_rate_limit;
/* Number of outstanding audit_buffers allowed. */
static int audit_backlog_limit = 64;
+static int audit_backlog_wait_time = 60 * HZ;
+static int audit_backlog_wait_overflow = 0;
/* The identity of the user shutting down the audit system. */
uid_t audit_sig_uid = -1;
@@ -106,18 +108,12 @@ static LIST_HEAD(audit_freelist);
static struct sk_buff_head audit_skb_queue;
static struct task_struct *kauditd_task;
static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
-
-/* There are three lists of rules -- one to search at task creation
- * time, one to search at syscall entry time, and another to search at
- * syscall exit time. */
-static LIST_HEAD(audit_tsklist);
-static LIST_HEAD(audit_entlist);
-static LIST_HEAD(audit_extlist);
+static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
/* The netlink socket is only to be read by 1 CPU, which lets us assume
* that list additions and deletions never happen simultaneously in
* auditsc.c */
-static DECLARE_MUTEX(audit_netlink_sem);
+DECLARE_MUTEX(audit_netlink_sem);
/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
* audit records. Since printk uses a 1024 byte buffer, this buffer
@@ -137,6 +133,7 @@ struct audit_buffer {
struct list_head list;
struct sk_buff *skb; /* formatted skb ready to send */
struct audit_context *ctx; /* NULL or associated context */
+ int gfp_mask;
};
static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
@@ -145,11 +142,6 @@ static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
nlh->nlmsg_pid = pid;
}
-struct audit_entry {
- struct list_head list;
- struct audit_rule rule;
-};
-
static void audit_panic(const char *message)
{
switch (audit_failure)
@@ -233,7 +225,7 @@ static int audit_set_rate_limit(int limit, uid_t loginuid)
{
int old = audit_rate_limit;
audit_rate_limit = limit;
- audit_log(NULL, AUDIT_CONFIG_CHANGE,
+ audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
"audit_rate_limit=%d old=%d by auid=%u",
audit_rate_limit, old, loginuid);
return old;
@@ -243,7 +235,7 @@ static int audit_set_backlog_limit(int limit, uid_t loginuid)
{
int old = audit_backlog_limit;
audit_backlog_limit = limit;
- audit_log(NULL, AUDIT_CONFIG_CHANGE,
+ audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
"audit_backlog_limit=%d old=%d by auid=%u",
audit_backlog_limit, old, loginuid);
return old;
@@ -255,7 +247,7 @@ static int audit_set_enabled(int state, uid_t loginuid)
if (state != 0 && state != 1)
return -EINVAL;
audit_enabled = state;
- audit_log(NULL, AUDIT_CONFIG_CHANGE,
+ audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
"audit_enabled=%d old=%d by auid=%u",
audit_enabled, old, loginuid);
return old;
@@ -269,7 +261,7 @@ static int audit_set_failure(int state, uid_t loginuid)
&& state != AUDIT_FAIL_PANIC)
return -EINVAL;
audit_failure = state;
- audit_log(NULL, AUDIT_CONFIG_CHANGE,
+ audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
"audit_failure=%d old=%d by auid=%u",
audit_failure, old, loginuid);
return old;
@@ -281,6 +273,7 @@ int kauditd_thread(void *dummy)
while (1) {
skb = skb_dequeue(&audit_skb_queue);
+ wake_up(&audit_backlog_wait);
if (skb) {
if (audit_pid) {
int err = netlink_unicast(audit_sock, skb, audit_pid, 0);
@@ -290,7 +283,7 @@ int kauditd_thread(void *dummy)
audit_pid = 0;
}
} else {
- printk(KERN_ERR "%s\n", skb->data + NLMSG_SPACE(0));
+ printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0));
kfree_skb(skb);
}
} else {
@@ -423,7 +416,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (status_get->mask & AUDIT_STATUS_PID) {
int old = audit_pid;
audit_pid = status_get->pid;
- audit_log(NULL, AUDIT_CONFIG_CHANGE,
+ audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
"audit_pid=%d old=%d by auid=%u",
audit_pid, old, loginuid);
}
@@ -435,15 +428,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
break;
case AUDIT_USER:
case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
- ab = audit_log_start(NULL, msg_type);
- if (!ab)
- break; /* audit_panic has been called */
- audit_log_format(ab,
- "user pid=%d uid=%u auid=%u"
- " msg='%.1024s'",
- pid, uid, loginuid, (char *)data);
- audit_set_pid(ab, pid);
- audit_log_end(ab);
+ if (!audit_enabled && msg_type != AUDIT_USER_AVC)
+ return 0;
+
+ err = audit_filter_user(&NETLINK_CB(skb), msg_type);
+ if (err == 1) {
+ err = 0;
+ ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
+ if (ab) {
+ audit_log_format(ab,
+ "user pid=%d uid=%u auid=%u msg='%.1024s'",
+ pid, uid, loginuid, (char *)data);
+ audit_set_pid(ab, pid);
+ audit_log_end(ab);
+ }
+ }
break;
case AUDIT_ADD:
case AUDIT_DEL:
@@ -514,7 +513,8 @@ static int __init audit_init(void)
{
printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
audit_default ? "enabled" : "disabled");
- audit_sock = netlink_kernel_create(NETLINK_AUDIT, audit_receive);
+ audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive,
+ THIS_MODULE);
if (!audit_sock)
audit_panic("cannot initialize netlink socket");
@@ -522,7 +522,7 @@ static int __init audit_init(void)
skb_queue_head_init(&audit_skb_queue);
audit_initialized = 1;
audit_enabled = audit_default;
- audit_log(NULL, AUDIT_KERNEL, "initialized");
+ audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
return 0;
}
__initcall(audit_init);
@@ -560,7 +560,7 @@ static void audit_buffer_free(struct audit_buffer *ab)
}
static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
- int gfp_mask, int type)
+ unsigned int __nocast gfp_mask, int type)
{
unsigned long flags;
struct audit_buffer *ab = NULL;
@@ -586,6 +586,7 @@ static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
goto err;
ab->ctx = ctx;
+ ab->gfp_mask = gfp_mask;
nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0));
nlh->nlmsg_type = type;
nlh->nlmsg_flags = 0;
@@ -605,26 +606,27 @@ err:
* (timestamp,serial) tuple is unique for each syscall and is live from
* syscall entry to syscall exit.
*
- * Atomic values are only guaranteed to be 24-bit, so we count down.
- *
* NOTE: Another possibility is to store the formatted records off the
* audit context (for those records that have a context), and emit them
* all at syscall exit. However, this could delay the reporting of
* significant errors until syscall exit (or never, if the system
* halts). */
+
unsigned int audit_serial(void)
{
- static atomic_t serial = ATOMIC_INIT(0xffffff);
- unsigned int a, b;
+ static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED;
+ static unsigned int serial = 0;
+
+ unsigned long flags;
+ unsigned int ret;
+ spin_lock_irqsave(&serial_lock, flags);
do {
- a = atomic_read(&serial);
- if (atomic_dec_and_test(&serial))
- atomic_set(&serial, 0xffffff);
- b = atomic_read(&serial);
- } while (b != a - 1);
+ ret = ++serial;
+ } while (unlikely(!ret));
+ spin_unlock_irqrestore(&serial_lock, flags);
- return 0xffffff - b;
+ return ret;
}
static inline void audit_get_stamp(struct audit_context *ctx,
@@ -644,17 +646,43 @@ static inline void audit_get_stamp(struct audit_context *ctx,
* syscall, then the syscall is marked as auditable and an audit record
* will be written at syscall exit. If there is no associated task, tsk
* should be NULL. */
-struct audit_buffer *audit_log_start(struct audit_context *ctx, int type)
+
+struct audit_buffer *audit_log_start(struct audit_context *ctx, int gfp_mask,
+ int type)
{
struct audit_buffer *ab = NULL;
struct timespec t;
unsigned int serial;
+ int reserve;
+ unsigned long timeout_start = jiffies;
if (!audit_initialized)
return NULL;
- if (audit_backlog_limit
- && skb_queue_len(&audit_skb_queue) > audit_backlog_limit) {
+ if (gfp_mask & __GFP_WAIT)
+ reserve = 0;
+ else
+ reserve = 5; /* Allow atomic callers to go up to five
+ entries over the normal backlog limit */
+
+ while (audit_backlog_limit
+ && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) {
+ if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time
+ && time_before(jiffies, timeout_start + audit_backlog_wait_time)) {
+
+ /* Wait for auditd to drain the queue a little */
+ DECLARE_WAITQUEUE(wait, current);
+ set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(&audit_backlog_wait, &wait);
+
+ if (audit_backlog_limit &&
+ skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
+ schedule_timeout(timeout_start + audit_backlog_wait_time - jiffies);
+
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&audit_backlog_wait, &wait);
+ continue;
+ }
if (audit_rate_check())
printk(KERN_WARNING
"audit: audit_backlog=%d > "
@@ -662,10 +690,12 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, int type)
skb_queue_len(&audit_skb_queue),
audit_backlog_limit);
audit_log_lost("backlog limit exceeded");
+ audit_backlog_wait_time = audit_backlog_wait_overflow;
+ wake_up(&audit_backlog_wait);
return NULL;
}
- ab = audit_buffer_alloc(ctx, GFP_ATOMIC, type);
+ ab = audit_buffer_alloc(ctx, gfp_mask, type);
if (!ab) {
audit_log_lost("out of memory in audit_log_start");
return NULL;
@@ -689,7 +719,7 @@ static inline int audit_expand(struct audit_buffer *ab, int extra)
{
struct sk_buff *skb = ab->skb;
int ret = pskb_expand_head(skb, skb_headroom(skb), extra,
- GFP_ATOMIC);
+ ab->gfp_mask);
if (ret < 0) {
audit_log_lost("out of memory in audit_expand");
return 0;
@@ -808,7 +838,7 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
audit_log_format(ab, " %s", prefix);
/* We will allow 11 spaces for ' (deleted)' to be appended */
- path = kmalloc(PATH_MAX+11, GFP_KERNEL);
+ path = kmalloc(PATH_MAX+11, ab->gfp_mask);
if (!path) {
audit_log_format(ab, "<no memory>");
return;
@@ -840,7 +870,7 @@ void audit_log_end(struct audit_buffer *ab)
ab->skb = NULL;
wake_up_interruptible(&kauditd_wait);
} else {
- printk("%s\n", ab->skb->data + NLMSG_SPACE(0));
+ printk(KERN_NOTICE "%s\n", ab->skb->data + NLMSG_SPACE(0));
}
}
audit_buffer_free(ab);
@@ -849,12 +879,13 @@ void audit_log_end(struct audit_buffer *ab)
/* Log an audit record. This is a convenience function that calls
* audit_log_start, audit_log_vformat, and audit_log_end. It may be
* called in any context. */
-void audit_log(struct audit_context *ctx, int type, const char *fmt, ...)
+void audit_log(struct audit_context *ctx, int gfp_mask, int type,
+ const char *fmt, ...)
{
struct audit_buffer *ab;
va_list args;
- ab = audit_log_start(ctx, type);
+ ab = audit_log_start(ctx, gfp_mask, type);
if (ab) {
va_start(args, fmt);
audit_log_vformat(ab, fmt, args);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e75f84e1a1a0..88696f639aab 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -39,6 +39,9 @@
#include <linux/audit.h>
#include <linux/personality.h>
#include <linux/time.h>
+#include <linux/kthread.h>
+#include <linux/netlink.h>
+#include <linux/compiler.h>
#include <asm/unistd.h>
/* 0 = no checking
@@ -95,6 +98,7 @@ struct audit_names {
uid_t uid;
gid_t gid;
dev_t rdev;
+ unsigned flags;
};
struct audit_aux_data {
@@ -167,9 +171,16 @@ struct audit_context {
/* There are three lists of rules -- one to search at task creation
* time, one to search at syscall entry time, and another to search at
* syscall exit time. */
-static LIST_HEAD(audit_tsklist);
-static LIST_HEAD(audit_entlist);
-static LIST_HEAD(audit_extlist);
+static struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
+ LIST_HEAD_INIT(audit_filter_list[0]),
+ LIST_HEAD_INIT(audit_filter_list[1]),
+ LIST_HEAD_INIT(audit_filter_list[2]),
+ LIST_HEAD_INIT(audit_filter_list[3]),
+ LIST_HEAD_INIT(audit_filter_list[4]),
+#if AUDIT_NR_FILTERS != 5
+#error Fix audit_filter_list initialiser
+#endif
+};
struct audit_entry {
struct list_head list;
@@ -179,9 +190,36 @@ struct audit_entry {
extern int audit_pid;
+/* Copy rule from user-space to kernel-space. Called from
+ * audit_add_rule during AUDIT_ADD. */
+static inline int audit_copy_rule(struct audit_rule *d, struct audit_rule *s)
+{
+ int i;
+
+ if (s->action != AUDIT_NEVER
+ && s->action != AUDIT_POSSIBLE
+ && s->action != AUDIT_ALWAYS)
+ return -1;
+ if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS)
+ return -1;
+ if ((s->flags & ~AUDIT_FILTER_PREPEND) >= AUDIT_NR_FILTERS)
+ return -1;
+
+ d->flags = s->flags;
+ d->action = s->action;
+ d->field_count = s->field_count;
+ for (i = 0; i < d->field_count; i++) {
+ d->fields[i] = s->fields[i];
+ d->values[i] = s->values[i];
+ }
+ for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i];
+ return 0;
+}
+
/* Check to see if two rules are identical. It is called from
+ * audit_add_rule during AUDIT_ADD and
* audit_del_rule during AUDIT_DEL. */
-static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b)
+static inline int audit_compare_rule(struct audit_rule *a, struct audit_rule *b)
{
int i;
@@ -210,19 +248,37 @@ static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b)
/* Note that audit_add_rule and audit_del_rule are called via
* audit_receive() in audit.c, and are protected by
* audit_netlink_sem. */
-static inline int audit_add_rule(struct audit_entry *entry,
- struct list_head *list)
+static inline int audit_add_rule(struct audit_rule *rule,
+ struct list_head *list)
{
- if (entry->rule.flags & AUDIT_PREPEND) {
- entry->rule.flags &= ~AUDIT_PREPEND;
+ struct audit_entry *entry;
+
+ /* Do not use the _rcu iterator here, since this is the only
+ * addition routine. */
+ list_for_each_entry(entry, list, list) {
+ if (!audit_compare_rule(rule, &entry->rule)) {
+ return -EEXIST;
+ }
+ }
+
+ if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL)))
+ return -ENOMEM;
+ if (audit_copy_rule(&entry->rule, rule)) {
+ kfree(entry);
+ return -EINVAL;
+ }
+
+ if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
+ entry->rule.flags &= ~AUDIT_FILTER_PREPEND;
list_add_rcu(&entry->list, list);
} else {
list_add_tail_rcu(&entry->list, list);
}
+
return 0;
}
-static void audit_free_rule(struct rcu_head *head)
+static inline void audit_free_rule(struct rcu_head *head)
{
struct audit_entry *e = container_of(head, struct audit_entry, rcu);
kfree(e);
@@ -245,82 +301,82 @@ static inline int audit_del_rule(struct audit_rule *rule,
return 0;
}
}
- return -EFAULT; /* No matching rule */
+ return -ENOENT; /* No matching rule */
}
-/* Copy rule from user-space to kernel-space. Called during
- * AUDIT_ADD. */
-static int audit_copy_rule(struct audit_rule *d, struct audit_rule *s)
+static int audit_list_rules(void *_dest)
{
+ int pid, seq;
+ int *dest = _dest;
+ struct audit_entry *entry;
int i;
- if (s->action != AUDIT_NEVER
- && s->action != AUDIT_POSSIBLE
- && s->action != AUDIT_ALWAYS)
- return -1;
- if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS)
- return -1;
+ pid = dest[0];
+ seq = dest[1];
+ kfree(dest);
- d->flags = s->flags;
- d->action = s->action;
- d->field_count = s->field_count;
- for (i = 0; i < d->field_count; i++) {
- d->fields[i] = s->fields[i];
- d->values[i] = s->values[i];
+ down(&audit_netlink_sem);
+
+ /* The *_rcu iterators not needed here because we are
+ always called with audit_netlink_sem held. */
+ for (i=0; i<AUDIT_NR_FILTERS; i++) {
+ list_for_each_entry(entry, &audit_filter_list[i], list)
+ audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
+ &entry->rule, sizeof(entry->rule));
}
- for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i];
+ audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
+
+ up(&audit_netlink_sem);
return 0;
}
int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
uid_t loginuid)
{
- u32 flags;
- struct audit_entry *entry;
+ struct task_struct *tsk;
+ int *dest;
int err = 0;
+ unsigned listnr;
switch (type) {
case AUDIT_LIST:
- /* The *_rcu iterators not needed here because we are
- always called with audit_netlink_sem held. */
- list_for_each_entry(entry, &audit_tsklist, list)
- audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
- &entry->rule, sizeof(entry->rule));
- list_for_each_entry(entry, &audit_entlist, list)
- audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
- &entry->rule, sizeof(entry->rule));
- list_for_each_entry(entry, &audit_extlist, list)
- audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
- &entry->rule, sizeof(entry->rule));
- audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
+ /* We can't just spew out the rules here because we might fill
+ * the available socket buffer space and deadlock waiting for
+ * auditctl to read from it... which isn't ever going to
+ * happen if we're actually running in the context of auditctl
+ * trying to _send_ the stuff */
+
+ dest = kmalloc(2 * sizeof(int), GFP_KERNEL);
+ if (!dest)
+ return -ENOMEM;
+ dest[0] = pid;
+ dest[1] = seq;
+
+ tsk = kthread_run(audit_list_rules, dest, "audit_list_rules");
+ if (IS_ERR(tsk)) {
+ kfree(dest);
+ err = PTR_ERR(tsk);
+ }
break;
case AUDIT_ADD:
- if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL)))
- return -ENOMEM;
- if (audit_copy_rule(&entry->rule, data)) {
- kfree(entry);
+ listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND;
+ if (listnr >= AUDIT_NR_FILTERS)
return -EINVAL;
- }
- flags = entry->rule.flags;
- if (!err && (flags & AUDIT_PER_TASK))
- err = audit_add_rule(entry, &audit_tsklist);
- if (!err && (flags & AUDIT_AT_ENTRY))
- err = audit_add_rule(entry, &audit_entlist);
- if (!err && (flags & AUDIT_AT_EXIT))
- err = audit_add_rule(entry, &audit_extlist);
- audit_log(NULL, AUDIT_CONFIG_CHANGE,
- "auid=%u added an audit rule\n", loginuid);
+
+ err = audit_add_rule(data, &audit_filter_list[listnr]);
+ if (!err)
+ audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+ "auid=%u added an audit rule\n", loginuid);
break;
case AUDIT_DEL:
- flags =((struct audit_rule *)data)->flags;
- if (!err && (flags & AUDIT_PER_TASK))
- err = audit_del_rule(data, &audit_tsklist);
- if (!err && (flags & AUDIT_AT_ENTRY))
- err = audit_del_rule(data, &audit_entlist);
- if (!err && (flags & AUDIT_AT_EXIT))
- err = audit_del_rule(data, &audit_extlist);
- audit_log(NULL, AUDIT_CONFIG_CHANGE,
- "auid=%u removed an audit rule\n", loginuid);
+ listnr =((struct audit_rule *)data)->flags & ~AUDIT_FILTER_PREPEND;
+ if (listnr >= AUDIT_NR_FILTERS)
+ return -EINVAL;
+
+ err = audit_del_rule(data, &audit_filter_list[listnr]);
+ if (!err)
+ audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
+ "auid=%u removed an audit rule\n", loginuid);
break;
default:
return -EINVAL;
@@ -384,8 +440,12 @@ static int audit_filter_rules(struct task_struct *tsk,
result = (ctx->return_code == value);
break;
case AUDIT_SUCCESS:
- if (ctx && ctx->return_valid)
- result = (ctx->return_valid == AUDITSC_SUCCESS);
+ if (ctx && ctx->return_valid) {
+ if (value)
+ result = (ctx->return_valid == AUDITSC_SUCCESS);
+ else
+ result = (ctx->return_valid == AUDITSC_FAILURE);
+ }
break;
case AUDIT_DEVMAJOR:
if (ctx) {
@@ -454,7 +514,7 @@ static enum audit_state audit_filter_task(struct task_struct *tsk)
enum audit_state state;
rcu_read_lock();
- list_for_each_entry_rcu(e, &audit_tsklist, list) {
+ list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
rcu_read_unlock();
return state;
@@ -474,20 +534,84 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
struct list_head *list)
{
struct audit_entry *e;
+ enum audit_state state;
+
+ if (audit_pid && tsk->tgid == audit_pid)
+ return AUDIT_DISABLED;
+
+ rcu_read_lock();
+ if (!list_empty(list)) {
+ int word = AUDIT_WORD(ctx->major);
+ int bit = AUDIT_BIT(ctx->major);
+
+ list_for_each_entry_rcu(e, list, list) {
+ if ((e->rule.mask[word] & bit) == bit
+ && audit_filter_rules(tsk, &e->rule, ctx, &state)) {
+ rcu_read_unlock();
+ return state;
+ }
+ }
+ }
+ rcu_read_unlock();
+ return AUDIT_BUILD_CONTEXT;
+}
+
+static int audit_filter_user_rules(struct netlink_skb_parms *cb,
+ struct audit_rule *rule,
+ enum audit_state *state)
+{
+ int i;
+
+ for (i = 0; i < rule->field_count; i++) {
+ u32 field = rule->fields[i] & ~AUDIT_NEGATE;
+ u32 value = rule->values[i];
+ int result = 0;
+
+ switch (field) {
+ case AUDIT_PID:
+ result = (cb->creds.pid == value);
+ break;
+ case AUDIT_UID:
+ result = (cb->creds.uid == value);
+ break;
+ case AUDIT_GID:
+ result = (cb->creds.gid == value);
+ break;
+ case AUDIT_LOGINUID:
+ result = (cb->loginuid == value);
+ break;
+ }
+
+ if (rule->fields[i] & AUDIT_NEGATE)
+ result = !result;
+ if (!result)
+ return 0;
+ }
+ switch (rule->action) {
+ case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
+ case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break;
+ case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
+ }
+ return 1;
+}
+
+int audit_filter_user(struct netlink_skb_parms *cb, int type)
+{
+ struct audit_entry *e;
enum audit_state state;
- int word = AUDIT_WORD(ctx->major);
- int bit = AUDIT_BIT(ctx->major);
+ int ret = 1;
rcu_read_lock();
- list_for_each_entry_rcu(e, list, list) {
- if ((e->rule.mask[word] & bit) == bit
- && audit_filter_rules(tsk, &e->rule, ctx, &state)) {
- rcu_read_unlock();
- return state;
+ list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
+ if (audit_filter_user_rules(cb, &e->rule, &state)) {
+ if (state == AUDIT_DISABLED)
+ ret = 0;
+ break;
}
}
rcu_read_unlock();
- return AUDIT_BUILD_CONTEXT;
+
+ return ret; /* Audit by default */
}
/* This should be called with task_lock() held. */
@@ -504,7 +628,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
if (context->in_syscall && !context->auditable) {
enum audit_state state;
- state = audit_filter_syscall(tsk, context, &audit_extlist);
+ state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]);
if (state == AUDIT_RECORD_CONTEXT)
context->auditable = 1;
}
@@ -679,13 +803,13 @@ static void audit_log_task_info(struct audit_buffer *ab)
up_read(&mm->mmap_sem);
}
-static void audit_log_exit(struct audit_context *context)
+static void audit_log_exit(struct audit_context *context, unsigned int gfp_mask)
{
int i;
struct audit_buffer *ab;
struct audit_aux_data *aux;
- ab = audit_log_start(context, AUDIT_SYSCALL);
+ ab = audit_log_start(context, gfp_mask, AUDIT_SYSCALL);
if (!ab)
return; /* audit_panic has been called */
audit_log_format(ab, "arch=%x syscall=%d",
@@ -717,7 +841,7 @@ static void audit_log_exit(struct audit_context *context)
for (aux = context->aux; aux; aux = aux->next) {
- ab = audit_log_start(context, aux->type);
+ ab = audit_log_start(context, GFP_KERNEL, aux->type);
if (!ab)
continue; /* audit_panic has been called */
@@ -754,14 +878,14 @@ static void audit_log_exit(struct audit_context *context)
}
if (context->pwd && context->pwdmnt) {
- ab = audit_log_start(context, AUDIT_CWD);
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
if (ab) {
audit_log_d_path(ab, "cwd=", context->pwd, context->pwdmnt);
audit_log_end(ab);
}
}
for (i = 0; i < context->name_count; i++) {
- ab = audit_log_start(context, AUDIT_PATH);
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
if (!ab)
continue; /* audit_panic has been called */
@@ -770,6 +894,8 @@ static void audit_log_exit(struct audit_context *context)
audit_log_format(ab, " name=");
audit_log_untrustedstring(ab, context->names[i].name);
}
+ audit_log_format(ab, " flags=%x\n", context->names[i].flags);
+
if (context->names[i].ino != (unsigned long)-1)
audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#o"
" ouid=%u ogid=%u rdev=%02x:%02x",
@@ -799,9 +925,11 @@ void audit_free(struct task_struct *tsk)
return;
/* Check for system calls that do not go through the exit
- * function (e.g., exit_group), then free context block. */
- if (context->in_syscall && context->auditable && context->pid != audit_pid)
- audit_log_exit(context);
+ * function (e.g., exit_group), then free context block.
+ * We use GFP_ATOMIC here because we might be doing this
+ * in the context of the idle thread */
+ if (context->in_syscall && context->auditable)
+ audit_log_exit(context, GFP_ATOMIC);
audit_free_context(context);
}
@@ -876,11 +1004,11 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major,
state = context->state;
if (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT)
- state = audit_filter_syscall(tsk, context, &audit_entlist);
+ state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]);
if (likely(state == AUDIT_DISABLED))
return;
- context->serial = audit_serial();
+ context->serial = 0;
context->ctime = CURRENT_TIME;
context->in_syscall = 1;
context->auditable = !!(state == AUDIT_RECORD_CONTEXT);
@@ -903,10 +1031,10 @@ void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code)
/* Not having a context here is ok, since the parent may have
* called __put_task_struct. */
if (likely(!context))
- return;
+ goto out;
- if (context->in_syscall && context->auditable && context->pid != audit_pid)
- audit_log_exit(context);
+ if (context->in_syscall && context->auditable)
+ audit_log_exit(context, GFP_KERNEL);
context->in_syscall = 0;
context->auditable = 0;
@@ -919,9 +1047,9 @@ void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code)
} else {
audit_free_names(context);
audit_free_aux(context);
- audit_zero_context(context, context->state);
tsk->audit_context = context;
}
+ out:
put_task_struct(tsk);
}
@@ -996,7 +1124,7 @@ void audit_putname(const char *name)
/* Store the inode and device from a lookup. Called from
* fs/namei.c:path_lookup(). */
-void audit_inode(const char *name, const struct inode *inode)
+void audit_inode(const char *name, const struct inode *inode, unsigned flags)
{
int idx;
struct audit_context *context = current->audit_context;
@@ -1022,17 +1150,20 @@ void audit_inode(const char *name, const struct inode *inode)
++context->ino_count;
#endif
}
- context->names[idx].ino = inode->i_ino;
- context->names[idx].dev = inode->i_sb->s_dev;
- context->names[idx].mode = inode->i_mode;
- context->names[idx].uid = inode->i_uid;
- context->names[idx].gid = inode->i_gid;
- context->names[idx].rdev = inode->i_rdev;
+ context->names[idx].flags = flags;
+ context->names[idx].ino = inode->i_ino;
+ context->names[idx].dev = inode->i_sb->s_dev;
+ context->names[idx].mode = inode->i_mode;
+ context->names[idx].uid = inode->i_uid;
+ context->names[idx].gid = inode->i_gid;
+ context->names[idx].rdev = inode->i_rdev;
}
void auditsc_get_stamp(struct audit_context *ctx,
struct timespec *t, unsigned int *serial)
{
+ if (!ctx->serial)
+ ctx->serial = audit_serial();
t->tv_sec = ctx->ctime.tv_sec;
t->tv_nsec = ctx->ctime.tv_nsec;
*serial = ctx->serial;
@@ -1044,7 +1175,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
if (task->audit_context) {
struct audit_buffer *ab;
- ab = audit_log_start(NULL, AUDIT_LOGIN);
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
if (ab) {
audit_log_format(ab, "login pid=%d uid=%u "
"old auid=%u new auid=%u",
@@ -1153,7 +1284,7 @@ void audit_signal_info(int sig, struct task_struct *t)
extern pid_t audit_sig_pid;
extern uid_t audit_sig_uid;
- if (unlikely(audit_pid && t->pid == audit_pid)) {
+ if (unlikely(audit_pid && t->tgid == audit_pid)) {
if (sig == SIGTERM || sig == SIGHUP) {
struct audit_context *ctx = current->audit_context;
audit_sig_pid = current->pid;
diff --git a/kernel/capability.c b/kernel/capability.c
index 64db1ee820c2..8986a37a67ea 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -31,8 +31,14 @@ static DEFINE_SPINLOCK(task_capability_lock);
* uninteresting and/or not to be changed.
*/
-/*
+/**
* sys_capget - get the capabilities of a given process.
+ * @header: pointer to struct that contains capability version and
+ * target pid data
+ * @dataptr: pointer to struct that contains the effective, permitted,
+ * and inheritable capabilities that are returned
+ *
+ * Returns 0 on success and < 0 on error.
*/
asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
{
@@ -141,8 +147,14 @@ static inline int cap_set_all(kernel_cap_t *effective,
return ret;
}
-/*
- * sys_capset - set capabilities for a given process, all processes, or all
+/**
+ * sys_capset - set capabilities for a process or a group of processes
+ * @header: pointer to struct that contains capability version and
+ * target pid data
+ * @data: pointer to struct that contains the effective, permitted,
+ * and inheritable capabilities
+ *
+ * Set capabilities for a given process, all processes, or all
* processes in a given process group.
*
* The restrictions on setting capabilities are specified as:
@@ -152,6 +164,8 @@ static inline int cap_set_all(kernel_cap_t *effective,
* I: any raised capabilities must be a subset of the (old current) permitted
* P: any raised capabilities must be a subset of the (old current) permitted
* E: must be set to a subset of (new target) permitted
+ *
+ * Returns 0 on success and < 0 on error.
*/
asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
{
diff --git a/kernel/compat.c b/kernel/compat.c
index ddfcaaa86623..102296e21ea8 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -48,8 +48,7 @@ static long compat_nanosleep_restart(struct restart_block *restart)
if (!time_after(expire, now))
return 0;
- current->state = TASK_INTERRUPTIBLE;
- expire = schedule_timeout(expire - now);
+ expire = schedule_timeout_interruptible(expire - now);
if (expire == 0)
return 0;
@@ -82,8 +81,7 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
return -EINVAL;
expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
- current->state = TASK_INTERRUPTIBLE;
- expire = schedule_timeout(expire);
+ expire = schedule_timeout_interruptible(expire);
if (expire == 0)
return 0;
@@ -795,8 +793,7 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
- current->state = TASK_INTERRUPTIBLE;
- timeout = schedule_timeout(timeout);
+ timeout = schedule_timeout_interruptible(timeout);
spin_lock_irq(&current->sighand->siglock);
sig = dequeue_signal(current, &s, &info);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 984c0bf3807f..79866bc6b3a1 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -180,6 +180,42 @@ static struct super_block *cpuset_sb = NULL;
*/
static DECLARE_MUTEX(cpuset_sem);
+static struct task_struct *cpuset_sem_owner;
+static int cpuset_sem_depth;
+
+/*
+ * The global cpuset semaphore cpuset_sem can be needed by the
+ * memory allocator to update a tasks mems_allowed (see the calls
+ * to cpuset_update_current_mems_allowed()) or to walk up the
+ * cpuset hierarchy to find a mem_exclusive cpuset see the calls
+ * to cpuset_excl_nodes_overlap()).
+ *
+ * But if the memory allocation is being done by cpuset.c code, it
+ * usually already holds cpuset_sem. Double tripping on a kernel
+ * semaphore deadlocks the current task, and any other task that
+ * subsequently tries to obtain the lock.
+ *
+ * Run all up's and down's on cpuset_sem through the following
+ * wrappers, which will detect this nested locking, and avoid
+ * deadlocking.
+ */
+
+static inline void cpuset_down(struct semaphore *psem)
+{
+ if (cpuset_sem_owner != current) {
+ down(psem);
+ cpuset_sem_owner = current;
+ }
+ cpuset_sem_depth++;
+}
+
+static inline void cpuset_up(struct semaphore *psem)
+{
+ if (--cpuset_sem_depth == 0) {
+ cpuset_sem_owner = NULL;
+ up(psem);
+ }
+}
/*
* A couple of forward declarations required, due to cyclic reference loop:
@@ -398,21 +434,31 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen)
* to continue to serve a useful existence. Next time it's released,
* we will get notified again, if it still has 'notify_on_release' set.
*
- * Note final arg to call_usermodehelper() is 0 - that means
- * don't wait. Since we are holding the global cpuset_sem here,
- * and we are asking another thread (started from keventd) to rmdir a
- * cpuset, we can't wait - or we'd deadlock with the removing thread
- * on cpuset_sem.
+ * The final arg to call_usermodehelper() is 0, which means don't
+ * wait. The separate /sbin/cpuset_release_agent task is forked by
+ * call_usermodehelper(), then control in this thread returns here,
+ * without waiting for the release agent task. We don't bother to
+ * wait because the caller of this routine has no use for the exit
+ * status of the /sbin/cpuset_release_agent task, so no sense holding
+ * our caller up for that.
+ *
+ * The simple act of forking that task might require more memory,
+ * which might need cpuset_sem. So this routine must be called while
+ * cpuset_sem is not held, to avoid a possible deadlock. See also
+ * comments for check_for_release(), below.
*/
-static int cpuset_release_agent(char *cpuset_str)
+static void cpuset_release_agent(const char *pathbuf)
{
char *argv[3], *envp[3];
int i;
+ if (!pathbuf)
+ return;
+
i = 0;
argv[i++] = "/sbin/cpuset_release_agent";
- argv[i++] = cpuset_str;
+ argv[i++] = (char *)pathbuf;
argv[i] = NULL;
i = 0;
@@ -421,17 +467,29 @@ static int cpuset_release_agent(char *cpuset_str)
envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
envp[i] = NULL;
- return call_usermodehelper(argv[0], argv, envp, 0);
+ call_usermodehelper(argv[0], argv, envp, 0);
+ kfree(pathbuf);
}
/*
* Either cs->count of using tasks transitioned to zero, or the
* cs->children list of child cpusets just became empty. If this
* cs is notify_on_release() and now both the user count is zero and
- * the list of children is empty, send notice to user land.
+ * the list of children is empty, prepare cpuset path in a kmalloc'd
+ * buffer, to be returned via ppathbuf, so that the caller can invoke
+ * cpuset_release_agent() with it later on, once cpuset_sem is dropped.
+ * Call here with cpuset_sem held.
+ *
+ * This check_for_release() routine is responsible for kmalloc'ing
+ * pathbuf. The above cpuset_release_agent() is responsible for
+ * kfree'ing pathbuf. The caller of these routines is responsible
+ * for providing a pathbuf pointer, initialized to NULL, then
+ * calling check_for_release() with cpuset_sem held and the address
+ * of the pathbuf pointer, then dropping cpuset_sem, then calling
+ * cpuset_release_agent() with pathbuf, as set by check_for_release().
*/
-static void check_for_release(struct cpuset *cs)
+static void check_for_release(struct cpuset *cs, char **ppathbuf)
{
if (notify_on_release(cs) && atomic_read(&cs->count) == 0 &&
list_empty(&cs->children)) {
@@ -441,10 +499,9 @@ static void check_for_release(struct cpuset *cs)
if (!buf)
return;
if (cpuset_path(cs, buf, PAGE_SIZE) < 0)
- goto out;
- cpuset_release_agent(buf);
-out:
- kfree(buf);
+ kfree(buf);
+ else
+ *ppathbuf = buf;
}
}
@@ -501,19 +558,10 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
* Refresh current tasks mems_allowed and mems_generation from
* current tasks cpuset. Call with cpuset_sem held.
*
- * Be sure to call refresh_mems() on any cpuset operation which
- * (1) holds cpuset_sem, and (2) might possibly alloc memory.
- * Call after obtaining cpuset_sem lock, before any possible
- * allocation. Otherwise one risks trying to allocate memory
- * while the task cpuset_mems_generation is not the same as
- * the mems_generation in its cpuset, which would deadlock on
- * cpuset_sem in cpuset_update_current_mems_allowed().
- *
- * Since we hold cpuset_sem, once refresh_mems() is called, the
- * test (current->cpuset_mems_generation != cs->mems_generation)
- * in cpuset_update_current_mems_allowed() will remain false,
- * until we drop cpuset_sem. Anyone else who would change our
- * cpusets mems_generation needs to lock cpuset_sem first.
+ * This routine is needed to update the per-task mems_allowed
+ * data, within the tasks context, when it is trying to allocate
+ * memory (in various mm/mempolicy.c routines) and notices
+ * that some other task has been modifying its cpuset.
*/
static void refresh_mems(void)
@@ -606,6 +654,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
* Call with cpuset_sem held. May nest a call to the
* lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
*/
+
static void update_cpu_domains(struct cpuset *cur)
{
struct cpuset *c, *par = cur->parent;
@@ -727,14 +776,14 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
return 0;
}
-static int attach_task(struct cpuset *cs, char *buf)
+static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
{
pid_t pid;
struct task_struct *tsk;
struct cpuset *oldcs;
cpumask_t cpus;
- if (sscanf(buf, "%d", &pid) != 1)
+ if (sscanf(pidbuf, "%d", &pid) != 1)
return -EIO;
if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
return -ENOSPC;
@@ -777,7 +826,7 @@ static int attach_task(struct cpuset *cs, char *buf)
put_task_struct(tsk);
if (atomic_dec_and_test(&oldcs->count))
- check_for_release(oldcs);
+ check_for_release(oldcs, ppathbuf);
return 0;
}
@@ -801,6 +850,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
struct cftype *cft = __d_cft(file->f_dentry);
cpuset_filetype_t type = cft->private;
char *buffer;
+ char *pathbuf = NULL;
int retval = 0;
/* Crude upper limit on largest legitimate cpulist user might write. */
@@ -817,7 +867,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
}
buffer[nbytes] = 0; /* nul-terminate */
- down(&cpuset_sem);
+ cpuset_down(&cpuset_sem);
if (is_removed(cs)) {
retval = -ENODEV;
@@ -841,7 +891,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer);
break;
case FILE_TASKLIST:
- retval = attach_task(cs, buffer);
+ retval = attach_task(cs, buffer, &pathbuf);
break;
default:
retval = -EINVAL;
@@ -851,7 +901,8 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
if (retval == 0)
retval = nbytes;
out2:
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
+ cpuset_release_agent(pathbuf);
out1:
kfree(buffer);
return retval;
@@ -890,9 +941,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
{
cpumask_t mask;
- down(&cpuset_sem);
+ cpuset_down(&cpuset_sem);
mask = cs->cpus_allowed;
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
return cpulist_scnprintf(page, PAGE_SIZE, mask);
}
@@ -901,9 +952,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
{
nodemask_t mask;
- down(&cpuset_sem);
+ cpuset_down(&cpuset_sem);
mask = cs->mems_allowed;
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
return nodelist_scnprintf(page, PAGE_SIZE, mask);
}
@@ -948,6 +999,10 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
*s++ = '\n';
*s = '\0';
+ /* Do nothing if *ppos is at the eof or beyond the eof. */
+ if (s - page <= *ppos)
+ return 0;
+
start = page + *ppos;
n = s - start;
retval = n - copy_to_user(buf, start, min(n, nbytes));
@@ -1306,8 +1361,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
if (!cs)
return -ENOMEM;
- down(&cpuset_sem);
- refresh_mems();
+ cpuset_down(&cpuset_sem);
cs->flags = 0;
if (notify_on_release(parent))
set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
@@ -1332,14 +1386,14 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
* will down() this new directory's i_sem and if we race with
* another mkdir, we might deadlock.
*/
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
err = cpuset_populate_dir(cs->dentry);
/* If err < 0, we have a half-filled directory - oh well ;) */
return 0;
err:
list_del(&cs->sibling);
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
kfree(cs);
return err;
}
@@ -1357,17 +1411,17 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
struct cpuset *cs = dentry->d_fsdata;
struct dentry *d;
struct cpuset *parent;
+ char *pathbuf = NULL;
/* the vfs holds both inode->i_sem already */
- down(&cpuset_sem);
- refresh_mems();
+ cpuset_down(&cpuset_sem);
if (atomic_read(&cs->count) > 0) {
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
return -EBUSY;
}
if (!list_empty(&cs->children)) {
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
return -EBUSY;
}
parent = cs->parent;
@@ -1376,14 +1430,15 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
update_cpu_domains(cs);
list_del(&cs->sibling); /* delete my sibling from parent->children */
if (list_empty(&parent->children))
- check_for_release(parent);
+ check_for_release(parent, &pathbuf);
spin_lock(&cs->dentry->d_lock);
d = dget(cs->dentry);
cs->dentry = NULL;
spin_unlock(&d->d_lock);
cpuset_d_remove_dir(d);
dput(d);
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
+ cpuset_release_agent(pathbuf);
return 0;
}
@@ -1440,10 +1495,10 @@ void __init cpuset_init_smp(void)
/**
* cpuset_fork - attach newly forked task to its parents cpuset.
- * @p: pointer to task_struct of forking parent process.
+ * @tsk: pointer to task_struct of forking parent process.
*
* Description: By default, on fork, a task inherits its
- * parents cpuset. The pointer to the shared cpuset is
+ * parent's cpuset. The pointer to the shared cpuset is
* automatically copied in fork.c by dup_task_struct().
* This cpuset_fork() routine need only increment the usage
* counter in that cpuset.
@@ -1471,7 +1526,6 @@ void cpuset_fork(struct task_struct *tsk)
* by the cpuset_sem semaphore. If you don't hold cpuset_sem,
* then a zero cpuset use count is a license to any other task to
* nuke the cpuset immediately.
- *
**/
void cpuset_exit(struct task_struct *tsk)
@@ -1484,10 +1538,13 @@ void cpuset_exit(struct task_struct *tsk)
task_unlock(tsk);
if (notify_on_release(cs)) {
- down(&cpuset_sem);
+ char *pathbuf = NULL;
+
+ cpuset_down(&cpuset_sem);
if (atomic_dec_and_test(&cs->count))
- check_for_release(cs);
- up(&cpuset_sem);
+ check_for_release(cs, &pathbuf);
+ cpuset_up(&cpuset_sem);
+ cpuset_release_agent(pathbuf);
} else {
atomic_dec(&cs->count);
}
@@ -1507,11 +1564,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk)
{
cpumask_t mask;
- down(&cpuset_sem);
+ cpuset_down(&cpuset_sem);
task_lock((struct task_struct *)tsk);
guarantee_online_cpus(tsk->cpuset, &mask);
task_unlock((struct task_struct *)tsk);
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
return mask;
}
@@ -1521,7 +1578,9 @@ void cpuset_init_current_mems_allowed(void)
current->mems_allowed = NODE_MASK_ALL;
}
-/*
+/**
+ * cpuset_update_current_mems_allowed - update mems parameters to new values
+ *
* If the current tasks cpusets mems_allowed changed behind our backs,
* update current->mems_allowed and mems_generation to the new value.
* Do not call this routine if in_interrupt().
@@ -1534,19 +1593,26 @@ void cpuset_update_current_mems_allowed(void)
if (!cs)
return; /* task is exiting */
if (current->cpuset_mems_generation != cs->mems_generation) {
- down(&cpuset_sem);
+ cpuset_down(&cpuset_sem);
refresh_mems();
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
}
}
+/**
+ * cpuset_restrict_to_mems_allowed - limit nodes to current mems_allowed
+ * @nodes: pointer to a node bitmap that is and-ed with mems_allowed
+ */
void cpuset_restrict_to_mems_allowed(unsigned long *nodes)
{
bitmap_and(nodes, nodes, nodes_addr(current->mems_allowed),
MAX_NUMNODES);
}
-/*
+/**
+ * cpuset_zonelist_valid_mems_allowed - check zonelist vs. curremt mems_allowed
+ * @zl: the zonelist to be checked
+ *
* Are any of the nodes on zonelist zl allowed in current->mems_allowed?
*/
int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
@@ -1563,12 +1629,113 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
}
/*
- * Is 'current' valid, and is zone z allowed in current->mems_allowed?
+ * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive
+ * ancestor to the specified cpuset. Call while holding cpuset_sem.
+ * If no ancestor is mem_exclusive (an unusual configuration), then
+ * returns the root cpuset.
*/
-int cpuset_zone_allowed(struct zone *z)
+static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
+{
+ while (!is_mem_exclusive(cs) && cs->parent)
+ cs = cs->parent;
+ return cs;
+}
+
+/**
+ * cpuset_zone_allowed - Can we allocate memory on zone z's memory node?
+ * @z: is this zone on an allowed node?
+ * @gfp_mask: memory allocation flags (we use __GFP_HARDWALL)
+ *
+ * If we're in interrupt, yes, we can always allocate. If zone
+ * z's node is in our tasks mems_allowed, yes. If it's not a
+ * __GFP_HARDWALL request and this zone's nodes is in the nearest
+ * mem_exclusive cpuset ancestor to this tasks cpuset, yes.
+ * Otherwise, no.
+ *
+ * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
+ * and do not allow allocations outside the current tasks cpuset.
+ * GFP_KERNEL allocations are not so marked, so can escape to the
+ * nearest mem_exclusive ancestor cpuset.
+ *
+ * Scanning up parent cpusets requires cpuset_sem. The __alloc_pages()
+ * routine only calls here with __GFP_HARDWALL bit _not_ set if
+ * it's a GFP_KERNEL allocation, and all nodes in the current tasks
+ * mems_allowed came up empty on the first pass over the zonelist.
+ * So only GFP_KERNEL allocations, if all nodes in the cpuset are
+ * short of memory, might require taking the cpuset_sem semaphore.
+ *
+ * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages()
+ * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing
+ * hardwall cpusets - no allocation on a node outside the cpuset is
+ * allowed (unless in interrupt, of course).
+ *
+ * The second loop doesn't even call here for GFP_ATOMIC requests
+ * (if the __alloc_pages() local variable 'wait' is set). That check
+ * and the checks below have the combined affect in the second loop of
+ * the __alloc_pages() routine that:
+ * in_interrupt - any node ok (current task context irrelevant)
+ * GFP_ATOMIC - any node ok
+ * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok
+ * GFP_USER - only nodes in current tasks mems allowed ok.
+ **/
+
+int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask)
+{
+ int node; /* node that zone z is on */
+ const struct cpuset *cs; /* current cpuset ancestors */
+ int allowed = 1; /* is allocation in zone z allowed? */
+
+ if (in_interrupt())
+ return 1;
+ node = z->zone_pgdat->node_id;
+ if (node_isset(node, current->mems_allowed))
+ return 1;
+ if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
+ return 0;
+
+ /* Not hardwall and node outside mems_allowed: scan up cpusets */
+ cpuset_down(&cpuset_sem);
+ cs = current->cpuset;
+ if (!cs)
+ goto done; /* current task exiting */
+ cs = nearest_exclusive_ancestor(cs);
+ allowed = node_isset(node, cs->mems_allowed);
+done:
+ cpuset_up(&cpuset_sem);
+ return allowed;
+}
+
+/**
+ * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors?
+ * @p: pointer to task_struct of some other task.
+ *
+ * Description: Return true if the nearest mem_exclusive ancestor
+ * cpusets of tasks @p and current overlap. Used by oom killer to
+ * determine if task @p's memory usage might impact the memory
+ * available to the current task.
+ *
+ * Acquires cpuset_sem - not suitable for calling from a fast path.
+ **/
+
+int cpuset_excl_nodes_overlap(const struct task_struct *p)
{
- return in_interrupt() ||
- node_isset(z->zone_pgdat->node_id, current->mems_allowed);
+ const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */
+ int overlap = 0; /* do cpusets overlap? */
+
+ cpuset_down(&cpuset_sem);
+ cs1 = current->cpuset;
+ if (!cs1)
+ goto done; /* current task exiting */
+ cs2 = p->cpuset;
+ if (!cs2)
+ goto done; /* task p is exiting */
+ cs1 = nearest_exclusive_ancestor(cs1);
+ cs2 = nearest_exclusive_ancestor(cs2);
+ overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
+done:
+ cpuset_up(&cpuset_sem);
+
+ return overlap;
}
/*
@@ -1589,7 +1756,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
return -ENOMEM;
tsk = m->private;
- down(&cpuset_sem);
+ cpuset_down(&cpuset_sem);
task_lock(tsk);
cs = tsk->cpuset;
task_unlock(tsk);
@@ -1604,7 +1771,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
seq_puts(m, buf);
seq_putc(m, '\n');
out:
- up(&cpuset_sem);
+ cpuset_up(&cpuset_sem);
kfree(buf);
return retval;
}
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
index 459ba49e376a..334c37f5218a 100644
--- a/kernel/crash_dump.c
+++ b/kernel/crash_dump.c
@@ -18,7 +18,16 @@
/* Stores the physical address of elf header of crash image. */
unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
-/*
+/**
+ * copy_oldmem_page - copy one page from "oldmem"
+ * @pfn: page frame number to be copied
+ * @buf: target memory address for the copy; this can be in kernel address
+ * space or user address space (see @userbuf)
+ * @csize: number of bytes to copy
+ * @offset: offset in bytes into the page (based on pfn) to begin the copy
+ * @userbuf: if set, @buf is in user address space, use copy_to_user(),
+ * otherwise @buf is in kernel address space, use memcpy().
+ *
* Copy a page from "oldmem". For this page, there is no pte mapped
* in the current kernel. We stitch up a pte, similar to kmap_atomic.
*/
diff --git a/kernel/exit.c b/kernel/exit.c
index 9d1b10ed0135..6d2089a1bce7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -368,17 +368,19 @@ EXPORT_SYMBOL(daemonize);
static inline void close_files(struct files_struct * files)
{
int i, j;
+ struct fdtable *fdt;
j = 0;
+ fdt = files_fdtable(files);
for (;;) {
unsigned long set;
i = j * __NFDBITS;
- if (i >= files->max_fdset || i >= files->max_fds)
+ if (i >= fdt->max_fdset || i >= fdt->max_fds)
break;
- set = files->open_fds->fds_bits[j++];
+ set = fdt->open_fds->fds_bits[j++];
while (set) {
if (set & 1) {
- struct file * file = xchg(&files->fd[i], NULL);
+ struct file * file = xchg(&fdt->fd[i], NULL);
if (file)
filp_close(file, files);
}
@@ -403,18 +405,22 @@ struct files_struct *get_files_struct(struct task_struct *task)
void fastcall put_files_struct(struct files_struct *files)
{
+ struct fdtable *fdt;
+
if (atomic_dec_and_test(&files->count)) {
close_files(files);
/*
* Free the fd and fdset arrays if we expanded them.
+ * If the fdtable was embedded, pass files for freeing
+ * at the end of the RCU grace period. Otherwise,
+ * you can free files immediately.
*/
- if (files->fd != &files->fd_array[0])
- free_fd_array(files->fd, files->max_fds);
- if (files->max_fdset > __FD_SETSIZE) {
- free_fdset(files->open_fds, files->max_fdset);
- free_fdset(files->close_on_exec, files->max_fdset);
- }
- kmem_cache_free(files_cachep, files);
+ fdt = files_fdtable(files);
+ if (fdt == &files->fdtab)
+ fdt->free_files = files;
+ else
+ kmem_cache_free(files_cachep, files);
+ free_fdtable(fdt);
}
}
@@ -829,8 +835,10 @@ fastcall NORET_TYPE void do_exit(long code)
acct_update_integrals(tsk);
update_mem_hiwater(tsk);
group_dead = atomic_dec_and_test(&tsk->signal->live);
- if (group_dead)
+ if (group_dead) {
+ del_timer_sync(&tsk->signal->real_timer);
acct_process(code);
+ }
exit_mm(tsk);
exit_sem(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index cdef6cea8900..8149f3602881 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -35,6 +35,7 @@
#include <linux/syscalls.h>
#include <linux/jiffies.h>
#include <linux/futex.h>
+#include <linux/rcupdate.h>
#include <linux/ptrace.h>
#include <linux/mount.h>
#include <linux/audit.h>
@@ -176,6 +177,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
/* One for us, one for whoever does the "release_task()" (usually parent) */
atomic_set(&tsk->usage,2);
+ atomic_set(&tsk->fs_excl, 0);
return tsk;
}
@@ -208,8 +210,10 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
struct file *file;
if (mpnt->vm_flags & VM_DONTCOPY) {
+ long pages = vma_pages(mpnt);
+ mm->total_vm -= pages;
__vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
- -vma_pages(mpnt));
+ -pages);
continue;
}
charge = 0;
@@ -562,24 +566,53 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
return 0;
}
-static int count_open_files(struct files_struct *files, int size)
+static int count_open_files(struct fdtable *fdt)
{
+ int size = fdt->max_fdset;
int i;
/* Find the last open fd */
for (i = size/(8*sizeof(long)); i > 0; ) {
- if (files->open_fds->fds_bits[--i])
+ if (fdt->open_fds->fds_bits[--i])
break;
}
i = (i+1) * 8 * sizeof(long);
return i;
}
+static struct files_struct *alloc_files(void)
+{
+ struct files_struct *newf;
+ struct fdtable *fdt;
+
+ newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
+ if (!newf)
+ goto out;
+
+ atomic_set(&newf->count, 1);
+
+ spin_lock_init(&newf->file_lock);
+ fdt = &newf->fdtab;
+ fdt->next_fd = 0;
+ fdt->max_fds = NR_OPEN_DEFAULT;
+ fdt->max_fdset = __FD_SETSIZE;
+ fdt->close_on_exec = &newf->close_on_exec_init;
+ fdt->open_fds = &newf->open_fds_init;
+ fdt->fd = &newf->fd_array[0];
+ INIT_RCU_HEAD(&fdt->rcu);
+ fdt->free_files = NULL;
+ fdt->next = NULL;
+ rcu_assign_pointer(newf->fdt, fdt);
+out:
+ return newf;
+}
+
static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
{
struct files_struct *oldf, *newf;
struct file **old_fds, **new_fds;
int open_files, size, i, error = 0, expand;
+ struct fdtable *old_fdt, *new_fdt;
/*
* A background process may not have any files ...
@@ -600,35 +633,27 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
*/
tsk->files = NULL;
error = -ENOMEM;
- newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
- if (!newf)
+ newf = alloc_files();
+ if (!newf)
goto out;
- atomic_set(&newf->count, 1);
-
- spin_lock_init(&newf->file_lock);
- newf->next_fd = 0;
- newf->max_fds = NR_OPEN_DEFAULT;
- newf->max_fdset = __FD_SETSIZE;
- newf->close_on_exec = &newf->close_on_exec_init;
- newf->open_fds = &newf->open_fds_init;
- newf->fd = &newf->fd_array[0];
-
spin_lock(&oldf->file_lock);
-
- open_files = count_open_files(oldf, oldf->max_fdset);
+ old_fdt = files_fdtable(oldf);
+ new_fdt = files_fdtable(newf);
+ size = old_fdt->max_fdset;
+ open_files = count_open_files(old_fdt);
expand = 0;
/*
* Check whether we need to allocate a larger fd array or fd set.
* Note: we're not a clone task, so the open count won't change.
*/
- if (open_files > newf->max_fdset) {
- newf->max_fdset = 0;
+ if (open_files > new_fdt->max_fdset) {
+ new_fdt->max_fdset = 0;
expand = 1;
}
- if (open_files > newf->max_fds) {
- newf->max_fds = 0;
+ if (open_files > new_fdt->max_fds) {
+ new_fdt->max_fds = 0;
expand = 1;
}
@@ -640,14 +665,21 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
spin_unlock(&newf->file_lock);
if (error < 0)
goto out_release;
+ new_fdt = files_fdtable(newf);
+ /*
+ * Reacquire the oldf lock and a pointer to its fd table
+ * who knows it may have a new bigger fd table. We need
+ * the latest pointer.
+ */
spin_lock(&oldf->file_lock);
+ old_fdt = files_fdtable(oldf);
}
- old_fds = oldf->fd;
- new_fds = newf->fd;
+ old_fds = old_fdt->fd;
+ new_fds = new_fdt->fd;
- memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8);
- memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8);
+ memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8);
+ memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8);
for (i = open_files; i != 0; i--) {
struct file *f = *old_fds++;
@@ -660,24 +692,24 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
* is partway through open(). So make sure that this
* fd is available to the new process.
*/
- FD_CLR(open_files - i, newf->open_fds);
+ FD_CLR(open_files - i, new_fdt->open_fds);
}
- *new_fds++ = f;
+ rcu_assign_pointer(*new_fds++, f);
}
spin_unlock(&oldf->file_lock);
/* compute the remainder to be cleared */
- size = (newf->max_fds - open_files) * sizeof(struct file *);
+ size = (new_fdt->max_fds - open_files) * sizeof(struct file *);
/* This is long word aligned thus could use a optimized version */
memset(new_fds, 0, size);
- if (newf->max_fdset > open_files) {
- int left = (newf->max_fdset-open_files)/8;
+ if (new_fdt->max_fdset > open_files) {
+ int left = (new_fdt->max_fdset-open_files)/8;
int start = open_files / (8 * sizeof(unsigned long));
- memset(&newf->open_fds->fds_bits[start], 0, left);
- memset(&newf->close_on_exec->fds_bits[start], 0, left);
+ memset(&new_fdt->open_fds->fds_bits[start], 0, left);
+ memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
}
tsk->files = newf;
@@ -686,9 +718,9 @@ out:
return error;
out_release:
- free_fdset (newf->close_on_exec, newf->max_fdset);
- free_fdset (newf->open_fds, newf->max_fdset);
- free_fd_array(newf->fd, newf->max_fds);
+ free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset);
+ free_fdset (new_fdt->open_fds, new_fdt->max_fdset);
+ free_fd_array(new_fdt->fd, new_fdt->max_fds);
kmem_cache_free(files_cachep, newf);
goto out;
}
@@ -992,6 +1024,9 @@ static task_t *copy_process(unsigned long clone_flags,
* of CLONE_PTRACE.
*/
clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
+#ifdef TIF_SYSCALL_EMU
+ clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
+#endif
/* Our parent execution domain becomes current domain
These must match for thread signalling to apply */
@@ -1110,6 +1145,9 @@ static task_t *copy_process(unsigned long clone_flags,
__get_cpu_var(process_counts)++;
}
+ if (!current->signal->tty && p->signal->tty)
+ p->signal->tty = NULL;
+
nr_threads++;
total_forks++;
write_unlock_irq(&tasklist_lock);
diff --git a/kernel/futex.c b/kernel/futex.c
index c7130f86106c..ca05fe6a70b2 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -40,6 +40,7 @@
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
+#include <asm/futex.h>
#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
@@ -327,6 +328,118 @@ out:
}
/*
+ * Wake up all waiters hashed on the physical page that is mapped
+ * to this virtual address:
+ */
+static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op)
+{
+ union futex_key key1, key2;
+ struct futex_hash_bucket *bh1, *bh2;
+ struct list_head *head;
+ struct futex_q *this, *next;
+ int ret, op_ret, attempt = 0;
+
+retryfull:
+ down_read(&current->mm->mmap_sem);
+
+ ret = get_futex_key(uaddr1, &key1);
+ if (unlikely(ret != 0))
+ goto out;
+ ret = get_futex_key(uaddr2, &key2);
+ if (unlikely(ret != 0))
+ goto out;
+
+ bh1 = hash_futex(&key1);
+ bh2 = hash_futex(&key2);
+
+retry:
+ if (bh1 < bh2)
+ spin_lock(&bh1->lock);
+ spin_lock(&bh2->lock);
+ if (bh1 > bh2)
+ spin_lock(&bh1->lock);
+
+ op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2);
+ if (unlikely(op_ret < 0)) {
+ int dummy;
+
+ spin_unlock(&bh1->lock);
+ if (bh1 != bh2)
+ spin_unlock(&bh2->lock);
+
+ /* futex_atomic_op_inuser needs to both read and write
+ * *(int __user *)uaddr2, but we can't modify it
+ * non-atomically. Therefore, if get_user below is not
+ * enough, we need to handle the fault ourselves, while
+ * still holding the mmap_sem. */
+ if (attempt++) {
+ struct vm_area_struct * vma;
+ struct mm_struct *mm = current->mm;
+
+ ret = -EFAULT;
+ if (attempt >= 2 ||
+ !(vma = find_vma(mm, uaddr2)) ||
+ vma->vm_start > uaddr2 ||
+ !(vma->vm_flags & VM_WRITE))
+ goto out;
+
+ switch (handle_mm_fault(mm, vma, uaddr2, 1)) {
+ case VM_FAULT_MINOR:
+ current->min_flt++;
+ break;
+ case VM_FAULT_MAJOR:
+ current->maj_flt++;
+ break;
+ default:
+ goto out;
+ }
+ goto retry;
+ }
+
+ /* If we would have faulted, release mmap_sem,
+ * fault it in and start all over again. */
+ up_read(&current->mm->mmap_sem);
+
+ ret = get_user(dummy, (int __user *)uaddr2);
+ if (ret)
+ return ret;
+
+ goto retryfull;
+ }
+
+ head = &bh1->chain;
+
+ list_for_each_entry_safe(this, next, head, list) {
+ if (match_futex (&this->key, &key1)) {
+ wake_futex(this);
+ if (++ret >= nr_wake)
+ break;
+ }
+ }
+
+ if (op_ret > 0) {
+ head = &bh2->chain;
+
+ op_ret = 0;
+ list_for_each_entry_safe(this, next, head, list) {
+ if (match_futex (&this->key, &key2)) {
+ wake_futex(this);
+ if (++op_ret >= nr_wake2)
+ break;
+ }
+ }
+ ret += op_ret;
+ }
+
+ spin_unlock(&bh1->lock);
+ if (bh1 != bh2)
+ spin_unlock(&bh2->lock);
+out:
+ up_read(&current->mm->mmap_sem);
+ return ret;
+}
+
+/*
* Requeue all waiters hashed on one physical page to another
* physical page.
*/
@@ -673,23 +786,17 @@ static int futex_fd(unsigned long uaddr, int signal)
filp->f_mapping = filp->f_dentry->d_inode->i_mapping;
if (signal) {
- int err;
err = f_setown(filp, current->pid, 1);
if (err < 0) {
- put_unused_fd(ret);
- put_filp(filp);
- ret = err;
- goto out;
+ goto error;
}
filp->f_owner.signum = signal;
}
q = kmalloc(sizeof(*q), GFP_KERNEL);
if (!q) {
- put_unused_fd(ret);
- put_filp(filp);
- ret = -ENOMEM;
- goto out;
+ err = -ENOMEM;
+ goto error;
}
down_read(&current->mm->mmap_sem);
@@ -697,10 +804,8 @@ static int futex_fd(unsigned long uaddr, int signal)
if (unlikely(err != 0)) {
up_read(&current->mm->mmap_sem);
- put_unused_fd(ret);
- put_filp(filp);
kfree(q);
- return err;
+ goto error;
}
/*
@@ -716,6 +821,11 @@ static int futex_fd(unsigned long uaddr, int signal)
fd_install(ret, filp);
out:
return ret;
+error:
+ put_unused_fd(ret);
+ put_filp(filp);
+ ret = err;
+ goto out;
}
long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
@@ -740,6 +850,9 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
case FUTEX_CMP_REQUEUE:
ret = futex_requeue(uaddr, uaddr2, val, val2, &val3);
break;
+ case FUTEX_WAKE_OP:
+ ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
+ break;
default:
ret = -ENOSYS;
}
diff --git a/kernel/intermodule.c b/kernel/intermodule.c
index 388977f3e9b7..0cbe633420fb 100644
--- a/kernel/intermodule.c
+++ b/kernel/intermodule.c
@@ -39,7 +39,7 @@ void inter_module_register(const char *im_name, struct module *owner, const void
struct list_head *tmp;
struct inter_module_entry *ime, *ime_new;
- if (!(ime_new = kmalloc(sizeof(*ime), GFP_KERNEL))) {
+ if (!(ime_new = kzalloc(sizeof(*ime), GFP_KERNEL))) {
/* Overloaded kernel, not fatal */
printk(KERN_ERR
"Aiee, inter_module_register: cannot kmalloc entry for '%s'\n",
@@ -47,7 +47,6 @@ void inter_module_register(const char *im_name, struct module *owner, const void
kmalloc_failed = 1;
return;
}
- memset(ime_new, 0, sizeof(*ime_new));
ime_new->im_name = im_name;
ime_new->owner = owner;
ime_new->userdata = userdata;
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index c29f83c16497..3ff7b925c387 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -111,7 +111,7 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
unsigned int status;
kstat_this_cpu.irqs[irq]++;
- if (desc->status & IRQ_PER_CPU) {
+ if (CHECK_IRQ_PER_CPU(desc->status)) {
irqreturn_t action_ret;
/*
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ac6700985705..1cfdb08ddf20 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -18,6 +18,10 @@
cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL };
+#if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE)
+cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS];
+#endif
+
/**
* synchronize_irq - wait for pending IRQ handlers (on other CPUs)
*
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 85d08daa6600..f26e534c6585 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -19,12 +19,22 @@ static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS];
*/
static struct proc_dir_entry *smp_affinity_entry[NR_IRQS];
-void __attribute__((weak))
-proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
+{
+ /*
+ * Save these away for later use. Re-progam when the
+ * interrupt is pending
+ */
+ set_pending_irq(irq, mask_val);
+}
+#else
+void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
{
irq_affinity[irq] = mask_val;
irq_desc[irq].handler->set_affinity(irq, mask_val);
}
+#endif
static int irq_affinity_read_proc(char *page, char **start, off_t off,
int count, int *eof, void *data)
diff --git a/kernel/itimer.c b/kernel/itimer.c
index a72cb0e5aa4b..7c1b25e25e47 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -112,28 +112,11 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value)
return error;
}
-/*
- * Called with P->sighand->siglock held and P->signal->real_timer inactive.
- * If interval is nonzero, arm the timer for interval ticks from now.
- */
-static inline void it_real_arm(struct task_struct *p, unsigned long interval)
-{
- p->signal->it_real_value = interval; /* XXX unnecessary field?? */
- if (interval == 0)
- return;
- if (interval > (unsigned long) LONG_MAX)
- interval = LONG_MAX;
- /* the "+ 1" below makes sure that the timer doesn't go off before
- * the interval requested. This could happen if
- * time requested % (usecs per jiffy) is more than the usecs left
- * in the current jiffy */
- p->signal->real_timer.expires = jiffies + interval + 1;
- add_timer(&p->signal->real_timer);
-}
void it_real_fn(unsigned long __data)
{
struct task_struct * p = (struct task_struct *) __data;
+ unsigned long inc = p->signal->it_real_incr;
send_group_sig_info(SIGALRM, SEND_SIG_PRIV, p);
@@ -141,14 +124,23 @@ void it_real_fn(unsigned long __data)
* Now restart the timer if necessary. We don't need any locking
* here because do_setitimer makes sure we have finished running
* before it touches anything.
+ * Note, we KNOW we are (or should be) at a jiffie edge here so
+ * we don't need the +1 stuff. Also, we want to use the prior
+ * expire value so as to not "slip" a jiffie if we are late.
+ * Deal with requesting a time prior to "now" here rather than
+ * in add_timer.
*/
- it_real_arm(p, p->signal->it_real_incr);
+ if (!inc)
+ return;
+ while (time_before_eq(p->signal->real_timer.expires, jiffies))
+ p->signal->real_timer.expires += inc;
+ add_timer(&p->signal->real_timer);
}
int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
{
struct task_struct *tsk = current;
- unsigned long val, interval;
+ unsigned long val, interval, expires;
cputime_t cval, cinterval, nval, ninterval;
switch (which) {
@@ -164,7 +156,10 @@ again:
}
tsk->signal->it_real_incr =
timeval_to_jiffies(&value->it_interval);
- it_real_arm(tsk, timeval_to_jiffies(&value->it_value));
+ expires = timeval_to_jiffies(&value->it_value);
+ if (expires)
+ mod_timer(&tsk->signal->real_timer,
+ jiffies + 1 + expires);
spin_unlock_irq(&tsk->sighand->siglock);
if (ovalue) {
jiffies_to_timeval(val, &ovalue->it_value);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b0237122b24e..f3ea492ab44d 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -37,6 +37,7 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/moduleloader.h>
+#include <asm-generic/sections.h>
#include <asm/cacheflush.h>
#include <asm/errno.h>
#include <asm/kdebug.h>
@@ -72,7 +73,7 @@ static struct hlist_head kprobe_insn_pages;
* get_insn_slot() - Find a slot on an executable page for an instruction.
* We allocate an executable page if there's no room on existing ones.
*/
-kprobe_opcode_t *get_insn_slot(void)
+kprobe_opcode_t __kprobes *get_insn_slot(void)
{
struct kprobe_insn_page *kip;
struct hlist_node *pos;
@@ -117,7 +118,7 @@ kprobe_opcode_t *get_insn_slot(void)
return kip->insns;
}
-void free_insn_slot(kprobe_opcode_t *slot)
+void __kprobes free_insn_slot(kprobe_opcode_t *slot)
{
struct kprobe_insn_page *kip;
struct hlist_node *pos;
@@ -152,20 +153,42 @@ void free_insn_slot(kprobe_opcode_t *slot)
}
/* Locks kprobe: irqs must be disabled */
-void lock_kprobes(void)
+void __kprobes lock_kprobes(void)
{
+ unsigned long flags = 0;
+
+ /* Avoiding local interrupts to happen right after we take the kprobe_lock
+ * and before we get a chance to update kprobe_cpu, this to prevent
+ * deadlock when we have a kprobe on ISR routine and a kprobe on task
+ * routine
+ */
+ local_irq_save(flags);
+
spin_lock(&kprobe_lock);
kprobe_cpu = smp_processor_id();
+
+ local_irq_restore(flags);
}
-void unlock_kprobes(void)
+void __kprobes unlock_kprobes(void)
{
+ unsigned long flags = 0;
+
+ /* Avoiding local interrupts to happen right after we update
+ * kprobe_cpu and before we get a a chance to release kprobe_lock,
+ * this to prevent deadlock when we have a kprobe on ISR routine and
+ * a kprobe on task routine
+ */
+ local_irq_save(flags);
+
kprobe_cpu = NR_CPUS;
spin_unlock(&kprobe_lock);
+
+ local_irq_restore(flags);
}
/* You have to be holding the kprobe_lock */
-struct kprobe *get_kprobe(void *addr)
+struct kprobe __kprobes *get_kprobe(void *addr)
{
struct hlist_head *head;
struct hlist_node *node;
@@ -183,7 +206,7 @@ struct kprobe *get_kprobe(void *addr)
* Aggregate handlers for multiple kprobes support - these handlers
* take care of invoking the individual kprobe handlers on p->list
*/
-static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
+static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
struct kprobe *kp;
@@ -198,8 +221,8 @@ static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
return 0;
}
-static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
- unsigned long flags)
+static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
+ unsigned long flags)
{
struct kprobe *kp;
@@ -213,8 +236,8 @@ static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
return;
}
-static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
- int trapnr)
+static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
+ int trapnr)
{
/*
* if we faulted "during" the execution of a user specified
@@ -227,7 +250,7 @@ static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
return 0;
}
-static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
+static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
{
struct kprobe *kp = curr_kprobe;
if (curr_kprobe && kp->break_handler) {
@@ -240,7 +263,7 @@ static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
return 0;
}
-struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp)
+struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp)
{
struct hlist_node *node;
struct kretprobe_instance *ri;
@@ -249,7 +272,8 @@ struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp)
return NULL;
}
-static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp)
+static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe
+ *rp)
{
struct hlist_node *node;
struct kretprobe_instance *ri;
@@ -258,7 +282,7 @@ static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp)
return NULL;
}
-void add_rp_inst(struct kretprobe_instance *ri)
+void __kprobes add_rp_inst(struct kretprobe_instance *ri)
{
/*
* Remove rp inst off the free list -
@@ -276,7 +300,7 @@ void add_rp_inst(struct kretprobe_instance *ri)
hlist_add_head(&ri->uflist, &ri->rp->used_instances);
}
-void recycle_rp_inst(struct kretprobe_instance *ri)
+void __kprobes recycle_rp_inst(struct kretprobe_instance *ri)
{
/* remove rp inst off the rprobe_inst_table */
hlist_del(&ri->hlist);
@@ -291,7 +315,7 @@ void recycle_rp_inst(struct kretprobe_instance *ri)
kfree(ri);
}
-struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk)
+struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk)
{
return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)];
}
@@ -302,7 +326,7 @@ struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk)
* instances associated with this task. These left over instances represent
* probed functions that have been called but will never return.
*/
-void kprobe_flush_task(struct task_struct *tk)
+void __kprobes kprobe_flush_task(struct task_struct *tk)
{
struct kretprobe_instance *ri;
struct hlist_head *head;
@@ -322,7 +346,8 @@ void kprobe_flush_task(struct task_struct *tk)
* This kprobe pre_handler is registered with every kretprobe. When probe
* hits it will set up the return probe.
*/
-static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
+static int __kprobes pre_handler_kretprobe(struct kprobe *p,
+ struct pt_regs *regs)
{
struct kretprobe *rp = container_of(p, struct kretprobe, kp);
@@ -353,7 +378,7 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
* Add the new probe to old_p->list. Fail if this is the
* second jprobe at the address - two jprobes can't coexist
*/
-static int add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
+static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
{
struct kprobe *kp;
@@ -395,7 +420,8 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
* the intricacies
* TODO: Move kcalloc outside the spinlock
*/
-static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p)
+static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
+ struct kprobe *p)
{
int ret = 0;
struct kprobe *ap;
@@ -434,15 +460,25 @@ static inline void cleanup_aggr_kprobe(struct kprobe *old_p,
spin_unlock_irqrestore(&kprobe_lock, flags);
}
-int register_kprobe(struct kprobe *p)
+static int __kprobes in_kprobes_functions(unsigned long addr)
+{
+ if (addr >= (unsigned long)__kprobes_text_start
+ && addr < (unsigned long)__kprobes_text_end)
+ return -EINVAL;
+ return 0;
+}
+
+int __kprobes register_kprobe(struct kprobe *p)
{
int ret = 0;
unsigned long flags = 0;
struct kprobe *old_p;
- if ((ret = arch_prepare_kprobe(p)) != 0) {
+ if ((ret = in_kprobes_functions((unsigned long) p->addr)) != 0)
+ return ret;
+ if ((ret = arch_prepare_kprobe(p)) != 0)
goto rm_kprobe;
- }
+
spin_lock_irqsave(&kprobe_lock, flags);
old_p = get_kprobe(p->addr);
p->nmissed = 0;
@@ -466,7 +502,7 @@ rm_kprobe:
return ret;
}
-void unregister_kprobe(struct kprobe *p)
+void __kprobes unregister_kprobe(struct kprobe *p)
{
unsigned long flags;
struct kprobe *old_p;
@@ -487,7 +523,7 @@ static struct notifier_block kprobe_exceptions_nb = {
.priority = 0x7fffffff /* we need to notified first */
};
-int register_jprobe(struct jprobe *jp)
+int __kprobes register_jprobe(struct jprobe *jp)
{
/* Todo: Verify probepoint is a function entry point */
jp->kp.pre_handler = setjmp_pre_handler;
@@ -496,14 +532,14 @@ int register_jprobe(struct jprobe *jp)
return register_kprobe(&jp->kp);
}
-void unregister_jprobe(struct jprobe *jp)
+void __kprobes unregister_jprobe(struct jprobe *jp)
{
unregister_kprobe(&jp->kp);
}
#ifdef ARCH_SUPPORTS_KRETPROBES
-int register_kretprobe(struct kretprobe *rp)
+int __kprobes register_kretprobe(struct kretprobe *rp)
{
int ret = 0;
struct kretprobe_instance *inst;
@@ -540,14 +576,14 @@ int register_kretprobe(struct kretprobe *rp)
#else /* ARCH_SUPPORTS_KRETPROBES */
-int register_kretprobe(struct kretprobe *rp)
+int __kprobes register_kretprobe(struct kretprobe *rp)
{
return -ENOSYS;
}
#endif /* ARCH_SUPPORTS_KRETPROBES */
-void unregister_kretprobe(struct kretprobe *rp)
+void __kprobes unregister_kretprobe(struct kretprobe *rp)
{
unsigned long flags;
struct kretprobe_instance *ri;
diff --git a/kernel/module.c b/kernel/module.c
index 068e271ab3a5..ff5c500ab625 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -20,6 +20,7 @@
#include <linux/module.h>
#include <linux/moduleloader.h>
#include <linux/init.h>
+#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/elf.h>
@@ -250,13 +251,18 @@ static inline unsigned int block_size(int val)
/* Created by linker magic */
extern char __per_cpu_start[], __per_cpu_end[];
-static void *percpu_modalloc(unsigned long size, unsigned long align)
+static void *percpu_modalloc(unsigned long size, unsigned long align,
+ const char *name)
{
unsigned long extra;
unsigned int i;
void *ptr;
- BUG_ON(align > SMP_CACHE_BYTES);
+ if (align > SMP_CACHE_BYTES) {
+ printk(KERN_WARNING "%s: per-cpu alignment %li > %i\n",
+ name, align, SMP_CACHE_BYTES);
+ align = SMP_CACHE_BYTES;
+ }
ptr = __per_cpu_start;
for (i = 0; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
@@ -348,7 +354,8 @@ static int percpu_modinit(void)
}
__initcall(percpu_modinit);
#else /* ... !CONFIG_SMP */
-static inline void *percpu_modalloc(unsigned long size, unsigned long align)
+static inline void *percpu_modalloc(unsigned long size, unsigned long align,
+ const char *name)
{
return NULL;
}
@@ -492,7 +499,7 @@ static inline int try_force(unsigned int flags)
{
int ret = (flags & O_TRUNC);
if (ret)
- tainted |= TAINT_FORCED_MODULE;
+ add_taint(TAINT_FORCED_MODULE);
return ret;
}
#else
@@ -891,7 +898,7 @@ static int check_version(Elf_Shdr *sechdrs,
if (!(tainted & TAINT_FORCED_MODULE)) {
printk("%s: no version for \"%s\" found: kernel tainted.\n",
mod->name, symname);
- tainted |= TAINT_FORCED_MODULE;
+ add_taint(TAINT_FORCED_MODULE);
}
return 1;
}
@@ -1346,7 +1353,7 @@ static void set_license(struct module *mod, const char *license)
if (!mod->license_gplok && !(tainted & TAINT_PROPRIETARY_MODULE)) {
printk(KERN_WARNING "%s: module license '%s' taints kernel.\n",
mod->name, license);
- tainted |= TAINT_PROPRIETARY_MODULE;
+ add_taint(TAINT_PROPRIETARY_MODULE);
}
}
@@ -1503,6 +1510,7 @@ static struct module *load_module(void __user *umod,
long err = 0;
void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
struct exception_table_entry *extable;
+ mm_segment_t old_fs;
DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
umod, len, uargs);
@@ -1603,7 +1611,7 @@ static struct module *load_module(void __user *umod,
modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
/* This is allowed: modprobe --force will invalidate it. */
if (!modmagic) {
- tainted |= TAINT_FORCED_MODULE;
+ add_taint(TAINT_FORCED_MODULE);
printk(KERN_WARNING "%s: no version magic, tainting kernel.\n",
mod->name);
} else if (!same_magic(modmagic, vermagic)) {
@@ -1644,7 +1652,8 @@ static struct module *load_module(void __user *umod,
if (pcpuindex) {
/* We have a special allocation for this section. */
percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size,
- sechdrs[pcpuindex].sh_addralign);
+ sechdrs[pcpuindex].sh_addralign,
+ mod->name);
if (!percpu) {
err = -ENOMEM;
goto free_mod;
@@ -1731,7 +1740,7 @@ static struct module *load_module(void __user *umod,
(mod->num_gpl_syms && !gplcrcindex)) {
printk(KERN_WARNING "%s: No versions for exported symbols."
" Tainting kernel.\n", mod->name);
- tainted |= TAINT_FORCED_MODULE;
+ add_taint(TAINT_FORCED_MODULE);
}
#endif
@@ -1772,6 +1781,24 @@ static struct module *load_module(void __user *umod,
if (err < 0)
goto cleanup;
+ /* flush the icache in correct context */
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+
+ /*
+ * Flush the instruction cache, since we've played with text.
+ * Do it before processing of module parameters, so the module
+ * can provide parameter accessor functions of its own.
+ */
+ if (mod->module_init)
+ flush_icache_range((unsigned long)mod->module_init,
+ (unsigned long)mod->module_init
+ + mod->init_size);
+ flush_icache_range((unsigned long)mod->module_core,
+ (unsigned long)mod->module_core + mod->core_size);
+
+ set_fs(old_fs);
+
mod->args = args;
if (obsparmindex) {
err = obsolete_params(mod->name, mod->args,
@@ -1853,7 +1880,6 @@ sys_init_module(void __user *umod,
const char __user *uargs)
{
struct module *mod;
- mm_segment_t old_fs = get_fs();
int ret = 0;
/* Must have permission */
@@ -1871,19 +1897,6 @@ sys_init_module(void __user *umod,
return PTR_ERR(mod);
}
- /* flush the icache in correct context */
- set_fs(KERNEL_DS);
-
- /* Flush the instruction cache, since we've played with text */
- if (mod->module_init)
- flush_icache_range((unsigned long)mod->module_init,
- (unsigned long)mod->module_init
- + mod->init_size);
- flush_icache_range((unsigned long)mod->module_core,
- (unsigned long)mod->module_core + mod->core_size);
-
- set_fs(old_fs);
-
/* Now sew it into the lists. They won't access us, since
strong_try_module_get() will fail. */
stop_machine_run(__link_module, mod, NR_CPUS);
diff --git a/kernel/panic.c b/kernel/panic.c
index 74ba5f3e46c7..aabc5f86fa3f 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -111,12 +111,11 @@ NORET_TYPE void panic(const char * fmt, ...)
mdelay(1);
i++;
}
- /*
- * Should we run the reboot notifier. For the moment Im
- * choosing not too. It might crash, be corrupt or do
- * more harm than good for other reasons.
+ /* This will not be a clean reboot, with everything
+ * shutting down. But if there is a chance of
+ * rebooting the system it will be rebooted.
*/
- machine_restart(NULL);
+ emergency_restart();
}
#ifdef __sparc__
{
diff --git a/kernel/params.c b/kernel/params.c
index d586c35ef8fc..fbf173215fd2 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -542,8 +542,8 @@ static void __init kernel_param_sysfs_setup(const char *name,
{
struct module_kobject *mk;
- mk = kmalloc(sizeof(struct module_kobject), GFP_KERNEL);
- memset(mk, 0, sizeof(struct module_kobject));
+ mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
+ BUG_ON(!mk);
mk->mod = THIS_MODULE;
kobj_set_kset_s(mk, module_subsys);
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 5b7b4736d82b..b7b532acd9fc 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -427,21 +427,23 @@ int posix_timer_event(struct k_itimer *timr,int si_private)
timr->sigq->info.si_code = SI_TIMER;
timr->sigq->info.si_tid = timr->it_id;
timr->sigq->info.si_value = timr->it_sigev_value;
+
if (timr->it_sigev_notify & SIGEV_THREAD_ID) {
- if (unlikely(timr->it_process->flags & PF_EXITING)) {
- timr->it_sigev_notify = SIGEV_SIGNAL;
- put_task_struct(timr->it_process);
- timr->it_process = timr->it_process->group_leader;
- goto group;
- }
- return send_sigqueue(timr->it_sigev_signo, timr->sigq,
- timr->it_process);
- }
- else {
- group:
- return send_group_sigqueue(timr->it_sigev_signo, timr->sigq,
- timr->it_process);
+ struct task_struct *leader;
+ int ret = send_sigqueue(timr->it_sigev_signo, timr->sigq,
+ timr->it_process);
+
+ if (likely(ret >= 0))
+ return ret;
+
+ timr->it_sigev_notify = SIGEV_SIGNAL;
+ leader = timr->it_process->group_leader;
+ put_task_struct(timr->it_process);
+ timr->it_process = leader;
}
+
+ return send_group_sigqueue(timr->it_sigev_signo, timr->sigq,
+ timr->it_process);
}
EXPORT_SYMBOL_GPL(posix_timer_event);
@@ -896,21 +898,10 @@ static int adjust_abs_time(struct k_clock *clock, struct timespec *tp,
jiffies_64_f = get_jiffies_64();
}
/*
- * Take away now to get delta
- */
- oc.tv_sec -= now.tv_sec;
- oc.tv_nsec -= now.tv_nsec;
- /*
- * Normalize...
+ * Take away now to get delta and normalize
*/
- while ((oc.tv_nsec - NSEC_PER_SEC) >= 0) {
- oc.tv_nsec -= NSEC_PER_SEC;
- oc.tv_sec++;
- }
- while ((oc.tv_nsec) < 0) {
- oc.tv_nsec += NSEC_PER_SEC;
- oc.tv_sec--;
- }
+ set_normalized_timespec(&oc, oc.tv_sec - now.tv_sec,
+ oc.tv_nsec - now.tv_nsec);
}else{
jiffies_64_f = get_jiffies_64();
}
@@ -1177,7 +1168,6 @@ void exit_itimers(struct signal_struct *sig)
tmr = list_entry(sig->posix_timers.next, struct k_itimer, list);
itimer_delete(tmr);
}
- del_timer_sync(&sig->real_timer);
}
/*
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 2c7121d9bff1..396c7873e804 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -1,5 +1,6 @@
config PM
bool "Power Management support"
+ depends on !IA64_HP_SIM
---help---
"Power Management" means that parts of your computer are shut
off or put into a power conserving "sleep" mode if they are not
@@ -28,7 +29,7 @@ config PM_DEBUG
config SOFTWARE_SUSPEND
bool "Software Suspend"
- depends on EXPERIMENTAL && PM && SWAP && ((X86 && SMP) || ((FVR || PPC32 || X86) && !SMP))
+ depends on PM && SWAP && (X86 || ((FVR || PPC32) && !SMP))
---help---
Enable the possibility of suspending the machine.
It doesn't need APM.
@@ -72,6 +73,18 @@ config PM_STD_PARTITION
suspended image to. It will simply pick the first available swap
device.
+config SWSUSP_ENCRYPT
+ bool "Encrypt suspend image"
+ depends on SOFTWARE_SUSPEND && CRYPTO=y && (CRYPTO_AES=y || CRYPTO_AES_586=y || CRYPTO_AES_X86_64=y)
+ default ""
+ ---help---
+ To prevent data gathering from swap after resume you can encrypt
+ the suspend image with a temporary key that is deleted on
+ resume.
+
+ Note that the temporary key is stored unencrypted on disk while the
+ system is suspended.
+
config SUSPEND_SMP
bool
depends on HOTPLUG_CPU && X86 && PM
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index fb8de63c2919..2d8bf054d036 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -16,6 +16,8 @@
#include <linux/device.h>
#include <linux/delay.h>
#include <linux/fs.h>
+#include <linux/mount.h>
+
#include "power.h"
@@ -57,16 +59,13 @@ static void power_down(suspend_disk_method_t mode)
error = pm_ops->enter(PM_SUSPEND_DISK);
break;
case PM_DISK_SHUTDOWN:
- printk("Powering off system\n");
- device_shutdown();
- machine_power_off();
+ kernel_power_off();
break;
case PM_DISK_REBOOT:
- device_shutdown();
- machine_restart(NULL);
+ kernel_restart(NULL);
break;
}
- machine_halt();
+ kernel_halt();
/* Valid image is on the disk, if we continue we risk serious data corruption
after resume. */
printk(KERN_CRIT "Please power me down manually\n");
@@ -113,24 +112,12 @@ static inline void platform_finish(void)
}
}
-static void finish(void)
-{
- device_resume();
- platform_finish();
- thaw_processes();
- enable_nonboot_cpus();
- pm_restore_console();
-}
-
-
static int prepare_processes(void)
{
int error;
pm_prepare_console();
-
sys_sync();
-
disable_nonboot_cpus();
if (freeze_processes()) {
@@ -163,15 +150,6 @@ static void unprepare_processes(void)
pm_restore_console();
}
-static int prepare_devices(void)
-{
- int error;
-
- if ((error = device_suspend(PMSG_FREEZE)))
- printk("Some devices failed to suspend\n");
- return error;
-}
-
/**
* pm_suspend_disk - The granpappy of power management.
*
@@ -188,17 +166,14 @@ int pm_suspend_disk(void)
error = prepare_processes();
if (error)
return error;
- error = prepare_devices();
+ error = device_suspend(PMSG_FREEZE);
if (error) {
+ printk("Some devices failed to suspend\n");
unprepare_processes();
return error;
}
- pr_debug("PM: Attempting to suspend to disk.\n");
- if (pm_disk_mode == PM_DISK_FIRMWARE)
- return pm_ops->enter(PM_SUSPEND_DISK);
-
pr_debug("PM: snapshotting memory.\n");
in_suspend = 1;
if ((error = swsusp_suspend()))
@@ -209,11 +184,20 @@ int pm_suspend_disk(void)
error = swsusp_write();
if (!error)
power_down(pm_disk_mode);
+ else {
+ /* swsusp_write can not fail in device_resume,
+ no need to do second device_resume */
+ swsusp_free();
+ unprepare_processes();
+ return error;
+ }
} else
pr_debug("PM: Image restored successfully.\n");
+
swsusp_free();
Done:
- finish();
+ device_resume();
+ unprepare_processes();
return error;
}
@@ -234,11 +218,25 @@ static int software_resume(void)
{
int error;
+ down(&pm_sem);
+ if (!swsusp_resume_device) {
+ if (!strlen(resume_file)) {
+ up(&pm_sem);
+ return -ENOENT;
+ }
+ swsusp_resume_device = name_to_dev_t(resume_file);
+ pr_debug("swsusp: Resume From Partition %s\n", resume_file);
+ } else {
+ pr_debug("swsusp: Resume From Partition %d:%d\n",
+ MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device));
+ }
+
if (noresume) {
/**
* FIXME: If noresume is specified, we need to find the partition
* and reset it back to normal swap space.
*/
+ up(&pm_sem);
return 0;
}
@@ -261,20 +259,24 @@ static int software_resume(void)
pr_debug("PM: Preparing devices for restore.\n");
- if ((error = prepare_devices()))
+ if ((error = device_suspend(PMSG_FREEZE))) {
+ printk("Some devices failed to suspend\n");
goto Free;
+ }
mb();
pr_debug("PM: Restoring saved image.\n");
swsusp_resume();
pr_debug("PM: Restore failed, recovering.n");
- finish();
+ device_resume();
Free:
swsusp_free();
Cleanup:
unprepare_processes();
Done:
+ /* For success case, the suspend path will release the lock */
+ up(&pm_sem);
pr_debug("PM: Resume from disk failed.\n");
return 0;
}
@@ -381,7 +383,9 @@ static ssize_t resume_store(struct subsystem * subsys, const char * buf, size_t
if (sscanf(buf, "%u:%u", &maj, &min) == 2) {
res = MKDEV(maj,min);
if (maj == MAJOR(res) && min == MINOR(res)) {
+ down(&pm_sem);
swsusp_resume_device = res;
+ up(&pm_sem);
printk("Attempting manual resume\n");
noresume = 0;
software_resume();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index c94cb9e95090..22bdc93cc038 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -19,6 +19,9 @@
#include "power.h"
+/*This is just an arbitrary number */
+#define FREE_PAGE_NUMBER (100)
+
DECLARE_MUTEX(pm_sem);
struct pm_ops * pm_ops = NULL;
@@ -49,6 +52,7 @@ void pm_set_ops(struct pm_ops * ops)
static int suspend_prepare(suspend_state_t state)
{
int error = 0;
+ unsigned int free_pages;
if (!pm_ops || !pm_ops->enter)
return -EPERM;
@@ -67,6 +71,16 @@ static int suspend_prepare(suspend_state_t state)
goto Thaw;
}
+ if ((free_pages = nr_free_pages()) < FREE_PAGE_NUMBER) {
+ pr_debug("PM: free some memory\n");
+ shrink_all_memory(FREE_PAGE_NUMBER - free_pages);
+ if (nr_free_pages() < FREE_PAGE_NUMBER) {
+ error = -ENOMEM;
+ printk(KERN_ERR "PM: No enough memory\n");
+ goto Thaw;
+ }
+ }
+
if (pm_ops->prepare) {
if ((error = pm_ops->prepare(state)))
goto Thaw;
@@ -129,11 +143,12 @@ static void suspend_finish(suspend_state_t state)
-static char * pm_states[] = {
+static char *pm_states[PM_SUSPEND_MAX] = {
[PM_SUSPEND_STANDBY] = "standby",
[PM_SUSPEND_MEM] = "mem",
+#ifdef CONFIG_SOFTWARE_SUSPEND
[PM_SUSPEND_DISK] = "disk",
- NULL,
+#endif
};
@@ -194,7 +209,7 @@ int software_suspend(void)
int pm_suspend(suspend_state_t state)
{
- if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX)
+ if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX)
return enter_state(state);
return -EINVAL;
}
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
index 61deda04e39e..159149321b3c 100644
--- a/kernel/power/pm.c
+++ b/kernel/power/pm.c
@@ -60,9 +60,8 @@ struct pm_dev *pm_register(pm_dev_t type,
unsigned long id,
pm_callback callback)
{
- struct pm_dev *dev = kmalloc(sizeof(struct pm_dev), GFP_KERNEL);
+ struct pm_dev *dev = kzalloc(sizeof(struct pm_dev), GFP_KERNEL);
if (dev) {
- memset(dev, 0, sizeof(*dev));
dev->type = type;
dev->id = id;
dev->callback = callback;
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 715081b2d829..7a4144ba3afd 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -9,6 +9,7 @@
#include <linux/init.h>
#include <linux/pm.h>
#include <linux/workqueue.h>
+#include <linux/reboot.h>
/*
* When the user hits Sys-Rq o to power down the machine this is the
@@ -17,8 +18,7 @@
static void do_poweroff(void *dummy)
{
- if (pm_power_off)
- pm_power_off();
+ kernel_power_off();
}
static DECLARE_WORK(poweroff_work, do_poweroff, NULL);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 0a086640bcfc..28de118f7a0b 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -38,7 +38,6 @@ void refrigerator(void)
processes around? */
long save;
save = current->state;
- current->state = TASK_UNINTERRUPTIBLE;
pr_debug("%s entered refrigerator\n", current->comm);
printk("=");
@@ -47,8 +46,10 @@ void refrigerator(void)
recalc_sigpending(); /* We sent fake signal, clean it up */
spin_unlock_irq(&current->sighand->siglock);
- while (frozen(current))
+ while (frozen(current)) {
+ current->state = TASK_UNINTERRUPTIBLE;
schedule();
+ }
pr_debug("%s left refrigerator\n", current->comm);
current->state = save;
}
@@ -59,6 +60,7 @@ int freeze_processes(void)
int todo;
unsigned long start_time;
struct task_struct *g, *p;
+ unsigned long flags;
printk( "Stopping tasks: " );
start_time = jiffies;
@@ -66,12 +68,9 @@ int freeze_processes(void)
todo = 0;
read_lock(&tasklist_lock);
do_each_thread(g, p) {
- unsigned long flags;
if (!freezeable(p))
continue;
- if ((frozen(p)) ||
- (p->state == TASK_TRACED) ||
- (p->state == TASK_STOPPED))
+ if (frozen(p))
continue;
freeze(p);
@@ -82,13 +81,33 @@ int freeze_processes(void)
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
yield(); /* Yield is okay here */
- if (time_after(jiffies, start_time + TIMEOUT)) {
+ if (todo && time_after(jiffies, start_time + TIMEOUT)) {
printk( "\n" );
printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo );
- return todo;
+ break;
}
} while(todo);
+ /* This does not unfreeze processes that are already frozen
+ * (we have slightly ugly calling convention in that respect,
+ * and caller must call thaw_processes() if something fails),
+ * but it cleans up leftover PF_FREEZE requests.
+ */
+ if (todo) {
+ read_lock(&tasklist_lock);
+ do_each_thread(g, p)
+ if (freezing(p)) {
+ pr_debug(" clean up: %s\n", p->comm);
+ p->flags &= ~PF_FREEZE;
+ spin_lock_irqsave(&p->sighand->siglock, flags);
+ recalc_sigpending_tsk(p);
+ spin_unlock_irqrestore(&p->sighand->siglock, flags);
+ }
+ while_each_thread(g, p);
+ read_unlock(&tasklist_lock);
+ return todo;
+ }
+
printk( "|\n" );
BUG_ON(in_atomic());
return 0;
diff --git a/kernel/power/smp.c b/kernel/power/smp.c
index bbe23079c62c..911fc62b8225 100644
--- a/kernel/power/smp.c
+++ b/kernel/power/smp.c
@@ -38,7 +38,7 @@ void disable_nonboot_cpus(void)
}
printk("Error taking cpu %d down: %d\n", cpu, error);
}
- BUG_ON(smp_processor_id() != 0);
+ BUG_ON(raw_smp_processor_id() != 0);
if (error)
panic("cpus not sleeping");
}
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index c285fc5a2320..d967e875ee82 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -31,6 +31,9 @@
* Alex Badea <vampire@go.ro>:
* Fixed runaway init
*
+ * Andreas Steinmetz <ast@domdv.de>:
+ * Added encrypted suspend option
+ *
* More state savers are welcome. Especially for the scsi layer...
*
* For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
@@ -63,6 +66,7 @@
#include <linux/console.h>
#include <linux/highmem.h>
#include <linux/bio.h>
+#include <linux/mount.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
@@ -70,8 +74,16 @@
#include <asm/tlbflush.h>
#include <asm/io.h>
+#include <linux/random.h>
+#include <linux/crypto.h>
+#include <asm/scatterlist.h>
+
#include "power.h"
+#define CIPHER "aes"
+#define MAXKEY 32
+#define MAXIV 32
+
/* References to section boundaries */
extern const void __nosave_begin, __nosave_end;
@@ -102,7 +114,8 @@ static suspend_pagedir_t *pagedir_save;
#define SWSUSP_SIG "S1SUSPEND"
static struct swsusp_header {
- char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)];
+ char reserved[PAGE_SIZE - 20 - MAXKEY - MAXIV - sizeof(swp_entry_t)];
+ u8 key_iv[MAXKEY+MAXIV];
swp_entry_t swsusp_info;
char orig_sig[10];
char sig[10];
@@ -128,6 +141,131 @@ static struct swsusp_info swsusp_info;
static unsigned short swapfile_used[MAX_SWAPFILES];
static unsigned short root_swap;
+static int write_page(unsigned long addr, swp_entry_t * loc);
+static int bio_read_page(pgoff_t page_off, void * page);
+
+static u8 key_iv[MAXKEY+MAXIV];
+
+#ifdef CONFIG_SWSUSP_ENCRYPT
+
+static int crypto_init(int mode, void **mem)
+{
+ int error = 0;
+ int len;
+ char *modemsg;
+ struct crypto_tfm *tfm;
+
+ modemsg = mode ? "suspend not possible" : "resume not possible";
+
+ tfm = crypto_alloc_tfm(CIPHER, CRYPTO_TFM_MODE_CBC);
+ if(!tfm) {
+ printk(KERN_ERR "swsusp: no tfm, %s\n", modemsg);
+ error = -EINVAL;
+ goto out;
+ }
+
+ if(MAXKEY < crypto_tfm_alg_min_keysize(tfm)) {
+ printk(KERN_ERR "swsusp: key buffer too small, %s\n", modemsg);
+ error = -ENOKEY;
+ goto fail;
+ }
+
+ if (mode)
+ get_random_bytes(key_iv, MAXKEY+MAXIV);
+
+ len = crypto_tfm_alg_max_keysize(tfm);
+ if (len > MAXKEY)
+ len = MAXKEY;
+
+ if (crypto_cipher_setkey(tfm, key_iv, len)) {
+ printk(KERN_ERR "swsusp: key setup failure, %s\n", modemsg);
+ error = -EKEYREJECTED;
+ goto fail;
+ }
+
+ len = crypto_tfm_alg_ivsize(tfm);
+
+ if (MAXIV < len) {
+ printk(KERN_ERR "swsusp: iv buffer too small, %s\n", modemsg);
+ error = -EOVERFLOW;
+ goto fail;
+ }
+
+ crypto_cipher_set_iv(tfm, key_iv+MAXKEY, len);
+
+ *mem=(void *)tfm;
+
+ goto out;
+
+fail: crypto_free_tfm(tfm);
+out: return error;
+}
+
+static __inline__ void crypto_exit(void *mem)
+{
+ crypto_free_tfm((struct crypto_tfm *)mem);
+}
+
+static __inline__ int crypto_write(struct pbe *p, void *mem)
+{
+ int error = 0;
+ struct scatterlist src, dst;
+
+ src.page = virt_to_page(p->address);
+ src.offset = 0;
+ src.length = PAGE_SIZE;
+ dst.page = virt_to_page((void *)&swsusp_header);
+ dst.offset = 0;
+ dst.length = PAGE_SIZE;
+
+ error = crypto_cipher_encrypt((struct crypto_tfm *)mem, &dst, &src,
+ PAGE_SIZE);
+
+ if (!error)
+ error = write_page((unsigned long)&swsusp_header,
+ &(p->swap_address));
+ return error;
+}
+
+static __inline__ int crypto_read(struct pbe *p, void *mem)
+{
+ int error = 0;
+ struct scatterlist src, dst;
+
+ error = bio_read_page(swp_offset(p->swap_address), (void *)p->address);
+ if (!error) {
+ src.offset = 0;
+ src.length = PAGE_SIZE;
+ dst.offset = 0;
+ dst.length = PAGE_SIZE;
+ src.page = dst.page = virt_to_page((void *)p->address);
+
+ error = crypto_cipher_decrypt((struct crypto_tfm *)mem, &dst,
+ &src, PAGE_SIZE);
+ }
+ return error;
+}
+#else
+static __inline__ int crypto_init(int mode, void *mem)
+{
+ return 0;
+}
+
+static __inline__ void crypto_exit(void *mem)
+{
+}
+
+static __inline__ int crypto_write(struct pbe *p, void *mem)
+{
+ return write_page(p->address, &(p->swap_address));
+}
+
+static __inline__ int crypto_read(struct pbe *p, void *mem)
+{
+ return bio_read_page(swp_offset(p->swap_address), (void *)p->address);
+}
+#endif
+
static int mark_swapfiles(swp_entry_t prev)
{
int error;
@@ -139,6 +277,7 @@ static int mark_swapfiles(swp_entry_t prev)
!memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
+ memcpy(swsusp_header.key_iv, key_iv, MAXKEY+MAXIV);
swsusp_header.swsusp_info = prev;
error = rw_swap_page_sync(WRITE,
swp_entry(root_swap, 0),
@@ -178,9 +317,9 @@ static int swsusp_swap_check(void) /* This is called before saving image */
len=strlen(resume_file);
root_swap = 0xFFFF;
- swap_list_lock();
+ spin_lock(&swap_lock);
for (i=0; i<MAX_SWAPFILES; i++) {
- if (swap_info[i].flags == 0) {
+ if (!(swap_info[i].flags & SWP_WRITEOK)) {
swapfile_used[i]=SWAPFILE_UNUSED;
} else {
if (!len) {
@@ -201,7 +340,7 @@ static int swsusp_swap_check(void) /* This is called before saving image */
}
}
}
- swap_list_unlock();
+ spin_unlock(&swap_lock);
return (root_swap != 0xffff) ? 0 : -ENODEV;
}
@@ -215,12 +354,12 @@ static void lock_swapdevices(void)
{
int i;
- swap_list_lock();
+ spin_lock(&swap_lock);
for (i = 0; i< MAX_SWAPFILES; i++)
if (swapfile_used[i] == SWAPFILE_IGNORED) {
- swap_info[i].flags ^= 0xFF;
+ swap_info[i].flags ^= SWP_WRITEOK;
}
- swap_list_unlock();
+ spin_unlock(&swap_lock);
}
/**
@@ -285,6 +424,10 @@ static int data_write(void)
int error = 0, i = 0;
unsigned int mod = nr_copy_pages / 100;
struct pbe *p;
+ void *tfm;
+
+ if ((error = crypto_init(1, &tfm)))
+ return error;
if (!mod)
mod = 1;
@@ -293,11 +436,14 @@ static int data_write(void)
for_each_pbe (p, pagedir_nosave) {
if (!(i%mod))
printk( "\b\b\b\b%3d%%", i / mod );
- if ((error = write_page(p->address, &(p->swap_address))))
+ if ((error = crypto_write(p, tfm))) {
+ crypto_exit(tfm);
return error;
+ }
i++;
}
printk("\b\b\b\bdone\n");
+ crypto_exit(tfm);
return error;
}
@@ -384,7 +530,6 @@ static int write_pagedir(void)
* write_suspend_image - Write entire image and metadata.
*
*/
-
static int write_suspend_image(void)
{
int error;
@@ -399,6 +544,7 @@ static int write_suspend_image(void)
if ((error = close_swap()))
goto FreePagedir;
Done:
+ memset(key_iv, 0, MAXKEY+MAXIV);
return error;
FreePagedir:
free_pagedir_entries();
@@ -590,18 +736,7 @@ static void copy_data_pages(void)
static int calc_nr(int nr_copy)
{
- int extra = 0;
- int mod = !!(nr_copy % PBES_PER_PAGE);
- int diff = (nr_copy / PBES_PER_PAGE) + mod;
-
- do {
- extra += diff;
- nr_copy += diff;
- mod = !!(nr_copy % PBES_PER_PAGE);
- diff = (nr_copy / PBES_PER_PAGE) + mod - extra;
- } while (diff > 0);
-
- return nr_copy;
+ return nr_copy + (nr_copy+PBES_PER_PAGE-2)/(PBES_PER_PAGE-1);
}
/**
@@ -869,13 +1004,6 @@ extern asmlinkage int swsusp_arch_resume(void);
asmlinkage int swsusp_save(void)
{
- int error = 0;
-
- if ((error = swsusp_swap_check())) {
- printk(KERN_ERR "swsusp: FATAL: cannot find swap device, try "
- "swapon -a!\n");
- return error;
- }
return suspend_prepare_image();
}
@@ -894,12 +1022,19 @@ int swsusp_suspend(void)
if ((error = device_power_down(PMSG_FREEZE))) {
printk(KERN_ERR "Some devices failed to power down, aborting suspend\n");
local_irq_enable();
- swsusp_free();
return error;
}
+
+ if ((error = swsusp_swap_check())) {
+ printk(KERN_ERR "swsusp: cannot find swap device, try swapon -a.\n");
+ device_power_up();
+ local_irq_enable();
+ return error;
+ }
+
save_processor_state();
if ((error = swsusp_arch_suspend()))
- swsusp_free();
+ printk(KERN_ERR "Error %d suspending\n", error);
/* Restore control flow magically appears here */
restore_processor_state();
BUG_ON (nr_copy_pages_check != nr_copy_pages);
@@ -924,6 +1059,7 @@ int swsusp_resume(void)
BUG_ON(!error);
restore_processor_state();
restore_highmem();
+ touch_softlockup_watchdog();
device_power_up();
local_irq_enable();
return error;
@@ -1166,9 +1302,9 @@ static int bio_write_page(pgoff_t page_off, void * page)
static const char * sanity_check(void)
{
dump_info();
- if(swsusp_info.version_code != LINUX_VERSION_CODE)
+ if (swsusp_info.version_code != LINUX_VERSION_CODE)
return "kernel version";
- if(swsusp_info.num_physpages != num_physpages)
+ if (swsusp_info.num_physpages != num_physpages)
return "memory size";
if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname))
return "system type";
@@ -1179,7 +1315,8 @@ static const char * sanity_check(void)
if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
return "machine";
#if 0
- if(swsusp_info.cpus != num_online_cpus())
+ /* We can't use number of online CPUs when we use hotplug to remove them ;-))) */
+ if (swsusp_info.cpus != num_possible_cpus())
return "number of cpus";
#endif
return NULL;
@@ -1212,13 +1349,14 @@ static int check_sig(void)
return error;
if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
+ memcpy(key_iv, swsusp_header.key_iv, MAXKEY+MAXIV);
+ memset(swsusp_header.key_iv, 0, MAXKEY+MAXIV);
/*
* Reset swap signature now.
*/
error = bio_write_page(0, &swsusp_header);
} else {
- printk(KERN_ERR "swsusp: Suspend partition has wrong signature?\n");
return -EINVAL;
}
if (!error)
@@ -1239,6 +1377,10 @@ static int data_read(struct pbe *pblist)
int error = 0;
int i = 0;
int mod = swsusp_info.image_pages / 100;
+ void *tfm;
+
+ if ((error = crypto_init(0, &tfm)))
+ return error;
if (!mod)
mod = 1;
@@ -1250,19 +1392,18 @@ static int data_read(struct pbe *pblist)
if (!(i % mod))
printk("\b\b\b\b%3d%%", i / mod);
- error = bio_read_page(swp_offset(p->swap_address),
- (void *)p->address);
- if (error)
+ if ((error = crypto_read(p, tfm))) {
+ crypto_exit(tfm);
return error;
+ }
i++;
}
printk("\b\b\b\bdone\n");
+ crypto_exit(tfm);
return error;
}
-extern dev_t name_to_dev_t(const char *line);
-
/**
* read_pagedir - Read page backup list pages from swap
*/
@@ -1356,16 +1497,6 @@ int swsusp_check(void)
{
int error;
- if (!swsusp_resume_device) {
- if (!strlen(resume_file))
- return -ENOENT;
- swsusp_resume_device = name_to_dev_t(resume_file);
- pr_debug("swsusp: Resume From Partition %s\n", resume_file);
- } else {
- pr_debug("swsusp: Resume From Partition %d:%d\n",
- MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device));
- }
-
resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
if (!IS_ERR(resume_bdev)) {
set_blocksize(resume_bdev, PAGE_SIZE);
@@ -1397,6 +1528,7 @@ int swsusp_read(void)
error = read_suspend_image();
blkdev_put(resume_bdev);
+ memset(key_iv, 0, MAXKEY+MAXIV);
if (!error)
pr_debug("swsusp: Reading resume file was successful\n");
diff --git a/kernel/printk.c b/kernel/printk.c
index 5092397fac29..a967605bc2e3 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -514,6 +514,9 @@ asmlinkage int printk(const char *fmt, ...)
return r;
}
+/* cpu currently holding logbuf_lock */
+static volatile unsigned int printk_cpu = UINT_MAX;
+
asmlinkage int vprintk(const char *fmt, va_list args)
{
unsigned long flags;
@@ -522,11 +525,15 @@ asmlinkage int vprintk(const char *fmt, va_list args)
static char printk_buf[1024];
static int log_level_unknown = 1;
- if (unlikely(oops_in_progress))
+ preempt_disable();
+ if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id())
+ /* If a crash is occurring during printk() on this CPU,
+ * make sure we can't deadlock */
zap_locks();
/* This stops the holder of console_sem just where we want him */
spin_lock_irqsave(&logbuf_lock, flags);
+ printk_cpu = smp_processor_id();
/* Emit the output into the temporary buffer */
printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args);
@@ -595,6 +602,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
* CPU until it is officially up. We shouldn't be calling into
* random console drivers on a CPU which doesn't exist yet..
*/
+ printk_cpu = UINT_MAX;
spin_unlock_irqrestore(&logbuf_lock, flags);
goto out;
}
@@ -604,6 +612,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
* We own the drivers. We can drop the spinlock and let
* release_console_sem() print the text
*/
+ printk_cpu = UINT_MAX;
spin_unlock_irqrestore(&logbuf_lock, flags);
console_may_schedule = 0;
release_console_sem();
@@ -613,9 +622,11 @@ asmlinkage int vprintk(const char *fmt, va_list args)
* allows the semaphore holder to proceed and to call the
* console drivers with the output which we just produced.
*/
+ printk_cpu = UINT_MAX;
spin_unlock_irqrestore(&logbuf_lock, flags);
}
out:
+ preempt_enable();
return printed_len;
}
EXPORT_SYMBOL(printk);
diff --git a/kernel/profile.c b/kernel/profile.c
index ad8cbb75ffa2..f89248e6d704 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -35,11 +35,11 @@ struct profile_hit {
#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ)
/* Oprofile timer tick hook */
-int (*timer_hook)(struct pt_regs *);
+int (*timer_hook)(struct pt_regs *) __read_mostly;
static atomic_t *prof_buffer;
static unsigned long prof_len, prof_shift;
-static int prof_on;
+static int prof_on __read_mostly;
static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 8dcb8f6288bc..019e04ec065a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -118,6 +118,33 @@ int ptrace_check_attach(struct task_struct *child, int kill)
return ret;
}
+static int may_attach(struct task_struct *task)
+{
+ if (!task->mm)
+ return -EPERM;
+ if (((current->uid != task->euid) ||
+ (current->uid != task->suid) ||
+ (current->uid != task->uid) ||
+ (current->gid != task->egid) ||
+ (current->gid != task->sgid) ||
+ (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
+ return -EPERM;
+ smp_rmb();
+ if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE))
+ return -EPERM;
+
+ return security_ptrace(current, task);
+}
+
+int ptrace_may_attach(struct task_struct *task)
+{
+ int err;
+ task_lock(task);
+ err = may_attach(task);
+ task_unlock(task);
+ return !err;
+}
+
int ptrace_attach(struct task_struct *task)
{
int retval;
@@ -127,22 +154,10 @@ int ptrace_attach(struct task_struct *task)
goto bad;
if (task == current)
goto bad;
- if (!task->mm)
- goto bad;
- if(((current->uid != task->euid) ||
- (current->uid != task->suid) ||
- (current->uid != task->uid) ||
- (current->gid != task->egid) ||
- (current->gid != task->sgid) ||
- (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
- goto bad;
- smp_rmb();
- if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE))
- goto bad;
/* the same process cannot be attached many times */
if (task->ptrace & PT_PTRACED)
goto bad;
- retval = security_ptrace(current, task);
+ retval = may_attach(task);
if (retval)
goto bad;
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index f436993bd590..bef3b6901b76 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -45,6 +45,7 @@
#include <linux/percpu.h>
#include <linux/notifier.h>
#include <linux/rcupdate.h>
+#include <linux/rcuref.h>
#include <linux/cpu.h>
/* Definition for rcupdate control block. */
@@ -72,6 +73,19 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
static int maxbatch = 10;
+#ifndef __HAVE_ARCH_CMPXCHG
+/*
+ * We use an array of spinlocks for the rcurefs -- similar to ones in sparc
+ * 32 bit atomic_t implementations, and a hash function similar to that
+ * for our refcounting needs.
+ * Can't help multiprocessors which donot have cmpxchg :(
+ */
+
+spinlock_t __rcuref_hash[RCUREF_HASH_SIZE] = {
+ [0 ... (RCUREF_HASH_SIZE-1)] = SPIN_LOCK_UNLOCKED
+};
+#endif
+
/**
* call_rcu - Queue an RCU callback for invocation after a grace period.
* @head: structure to be used for queueing the RCU updates.
diff --git a/kernel/resource.c b/kernel/resource.c
index 26967e042201..92285d822de6 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -430,10 +430,9 @@ EXPORT_SYMBOL(adjust_resource);
*/
struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name)
{
- struct resource *res = kmalloc(sizeof(*res), GFP_KERNEL);
+ struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
if (res) {
- memset(res, 0, sizeof(*res));
res->name = name;
res->start = start;
res->end = start + n - 1;
diff --git a/kernel/sched.c b/kernel/sched.c
index 5f2182d42241..81b3a96ed2d0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -875,7 +875,7 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
* smp_call_function() if an IPI is sent by the same process we are
* waiting to become inactive.
*/
-void wait_task_inactive(task_t * p)
+void wait_task_inactive(task_t *p)
{
unsigned long flags;
runqueue_t *rq;
@@ -966,8 +966,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
int local_group;
int i;
+ /* Skip over this group if it has no CPUs allowed */
+ if (!cpus_intersects(group->cpumask, p->cpus_allowed))
+ goto nextgroup;
+
local_group = cpu_isset(this_cpu, group->cpumask);
- /* XXX: put a cpus allowed check */
/* Tally up the load of all CPUs in the group */
avg_load = 0;
@@ -992,6 +995,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
min_load = avg_load;
idlest = group;
}
+nextgroup:
group = group->next;
} while (group != sd->groups);
@@ -1003,13 +1007,18 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
/*
* find_idlest_queue - find the idlest runqueue among the cpus in group.
*/
-static int find_idlest_cpu(struct sched_group *group, int this_cpu)
+static int
+find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
+ cpumask_t tmp;
unsigned long load, min_load = ULONG_MAX;
int idlest = -1;
int i;
- for_each_cpu_mask(i, group->cpumask) {
+ /* Traverse only the allowed CPUs */
+ cpus_and(tmp, group->cpumask, p->cpus_allowed);
+
+ for_each_cpu_mask(i, tmp) {
load = source_load(i, 0);
if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -1052,7 +1061,7 @@ static int sched_balance_self(int cpu, int flag)
if (!group)
goto nextlevel;
- new_cpu = find_idlest_cpu(group, cpu);
+ new_cpu = find_idlest_cpu(group, t, cpu);
if (new_cpu == -1 || new_cpu == cpu)
goto nextlevel;
@@ -1127,7 +1136,7 @@ static inline int wake_idle(int cpu, task_t *p)
*
* returns failure only if the task is already active.
*/
-static int try_to_wake_up(task_t * p, unsigned int state, int sync)
+static int try_to_wake_up(task_t *p, unsigned int state, int sync)
{
int cpu, this_cpu, success = 0;
unsigned long flags;
@@ -1252,6 +1261,16 @@ out_activate:
}
/*
+ * Tasks that have marked their sleep as noninteractive get
+ * woken up without updating their sleep average. (i.e. their
+ * sleep is handled in a priority-neutral manner, no priority
+ * boost and no penalty.)
+ */
+ if (old_state & TASK_NONINTERACTIVE)
+ __activate_task(p, rq);
+ else
+ activate_task(p, rq, cpu == this_cpu);
+ /*
* Sync wakeups (i.e. those types of wakeups where the waker
* has indicated that it will leave the CPU in short order)
* don't trigger a preemption, if the woken up task will run on
@@ -1259,7 +1278,6 @@ out_activate:
* the waker guarantees that the freshly woken up task is going
* to be considered on this CPU.)
*/
- activate_task(p, rq, cpu == this_cpu);
if (!sync || cpu != this_cpu) {
if (TASK_PREEMPTS_CURR(p, rq))
resched_task(rq->curr);
@@ -1274,7 +1292,7 @@ out:
return success;
}
-int fastcall wake_up_process(task_t * p)
+int fastcall wake_up_process(task_t *p)
{
return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
@@ -1353,7 +1371,7 @@ void fastcall sched_fork(task_t *p, int clone_flags)
* that must be done for every newly created context, then puts the task
* on the runqueue and wakes it.
*/
-void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
+void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
{
unsigned long flags;
int this_cpu, cpu;
@@ -1436,7 +1454,7 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
* artificially, because any timeslice recovered here
* was given away by the parent in the first place.)
*/
-void fastcall sched_exit(task_t * p)
+void fastcall sched_exit(task_t *p)
{
unsigned long flags;
runqueue_t *rq;
@@ -1478,6 +1496,7 @@ static inline void prepare_task_switch(runqueue_t *rq, task_t *next)
/**
* finish_task_switch - clean up after a task-switch
+ * @rq: runqueue associated with task-switch
* @prev: the thread we just switched away from.
*
* finish_task_switch must be called after the context switch, paired
@@ -1510,6 +1529,10 @@ static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
* Manfred Spraul <manfred@colorfullife.com>
*/
prev_task_flags = prev->flags;
+#ifdef CONFIG_DEBUG_SPINLOCK
+ /* this is a valid case when another task releases the spinlock */
+ rq->lock.owner = current;
+#endif
finish_arch_switch(prev);
finish_lock_switch(rq, prev);
if (mm)
@@ -1752,7 +1775,8 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
*/
static inline
int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
- struct sched_domain *sd, enum idle_type idle, int *all_pinned)
+ struct sched_domain *sd, enum idle_type idle,
+ int *all_pinned)
{
/*
* We do not migrate tasks that are:
@@ -1882,10 +1906,11 @@ out:
*/
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
- unsigned long *imbalance, enum idle_type idle)
+ unsigned long *imbalance, enum idle_type idle, int *sd_idle)
{
struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
unsigned long max_load, avg_load, total_load, this_load, total_pwr;
+ unsigned long max_pull;
int load_idx;
max_load = this_load = total_load = total_pwr = 0;
@@ -1907,6 +1932,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
avg_load = 0;
for_each_cpu_mask(i, group->cpumask) {
+ if (*sd_idle && !idle_cpu(i))
+ *sd_idle = 0;
+
/* Bias balancing toward cpus of our domain */
if (local_group)
load = target_load(i, load_idx);
@@ -1932,7 +1960,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
group = group->next;
} while (group != sd->groups);
- if (!busiest || this_load >= max_load)
+ if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE)
goto out_balanced;
avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
@@ -1952,8 +1980,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
* by pulling tasks to us. Be careful of negative numbers as they'll
* appear as very large values with unsigned longs.
*/
+
+ /* Don't want to pull so many tasks that a group would go idle */
+ max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE);
+
/* How much load to actually move to equalise the imbalance */
- *imbalance = min((max_load - avg_load) * busiest->cpu_power,
+ *imbalance = min(max_pull * busiest->cpu_power,
(avg_load - this_load) * this->cpu_power)
/ SCHED_LOAD_SCALE;
@@ -2050,11 +2082,14 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
unsigned long imbalance;
int nr_moved, all_pinned = 0;
int active_balance = 0;
+ int sd_idle = 0;
+
+ if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER)
+ sd_idle = 1;
- spin_lock(&this_rq->lock);
schedstat_inc(sd, lb_cnt[idle]);
- group = find_busiest_group(sd, this_cpu, &imbalance, idle);
+ group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle);
if (!group) {
schedstat_inc(sd, lb_nobusyg[idle]);
goto out_balanced;
@@ -2078,19 +2113,16 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
* still unbalanced. nr_moved simply stays zero, so it is
* correctly treated as an imbalance.
*/
- double_lock_balance(this_rq, busiest);
+ double_rq_lock(this_rq, busiest);
nr_moved = move_tasks(this_rq, this_cpu, busiest,
- imbalance, sd, idle,
- &all_pinned);
- spin_unlock(&busiest->lock);
+ imbalance, sd, idle, &all_pinned);
+ double_rq_unlock(this_rq, busiest);
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(all_pinned))
goto out_balanced;
}
- spin_unlock(&this_rq->lock);
-
if (!nr_moved) {
schedstat_inc(sd, lb_failed[idle]);
sd->nr_balance_failed++;
@@ -2098,6 +2130,16 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
spin_lock(&busiest->lock);
+
+ /* don't kick the migration_thread, if the curr
+ * task on busiest cpu can't be moved to this_cpu
+ */
+ if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
+ spin_unlock(&busiest->lock);
+ all_pinned = 1;
+ goto out_one_pinned;
+ }
+
if (!busiest->active_balance) {
busiest->active_balance = 1;
busiest->push_cpu = this_cpu;
@@ -2130,19 +2172,23 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
sd->balance_interval *= 2;
}
+ if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+ return -1;
return nr_moved;
out_balanced:
- spin_unlock(&this_rq->lock);
-
schedstat_inc(sd, lb_balanced[idle]);
sd->nr_balance_failed = 0;
+
+out_one_pinned:
/* tune up the balancing interval */
if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
(sd->balance_interval < sd->max_interval))
sd->balance_interval *= 2;
+ if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+ return -1;
return 0;
}
@@ -2160,9 +2206,13 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
runqueue_t *busiest = NULL;
unsigned long imbalance;
int nr_moved = 0;
+ int sd_idle = 0;
+
+ if (sd->flags & SD_SHARE_CPUPOWER)
+ sd_idle = 1;
schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
- group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
+ group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle);
if (!group) {
schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
goto out_balanced;
@@ -2176,22 +2226,30 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
BUG_ON(busiest == this_rq);
- /* Attempt to move tasks */
- double_lock_balance(this_rq, busiest);
-
schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
- nr_moved = move_tasks(this_rq, this_cpu, busiest,
+
+ nr_moved = 0;
+ if (busiest->nr_running > 1) {
+ /* Attempt to move tasks */
+ double_lock_balance(this_rq, busiest);
+ nr_moved = move_tasks(this_rq, this_cpu, busiest,
imbalance, sd, NEWLY_IDLE, NULL);
- if (!nr_moved)
+ spin_unlock(&busiest->lock);
+ }
+
+ if (!nr_moved) {
schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
- else
+ if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+ return -1;
+ } else
sd->nr_balance_failed = 0;
- spin_unlock(&busiest->lock);
return nr_moved;
out_balanced:
schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
+ if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+ return -1;
sd->nr_balance_failed = 0;
return 0;
}
@@ -2316,7 +2374,11 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
if (j - sd->last_balance >= interval) {
if (load_balance(this_cpu, this_rq, sd, idle)) {
- /* We've pulled tasks over so no longer idle */
+ /*
+ * We've pulled tasks over so either we're no
+ * longer idle, or one of our SMT siblings is
+ * not idle.
+ */
idle = NOT_IDLE;
}
sd->last_balance += interval;
@@ -2575,6 +2637,13 @@ out:
}
#ifdef CONFIG_SCHED_SMT
+static inline void wakeup_busy_runqueue(runqueue_t *rq)
+{
+ /* If an SMT runqueue is sleeping due to priority reasons wake it up */
+ if (rq->curr == rq->idle && rq->nr_running)
+ resched_task(rq->idle);
+}
+
static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
{
struct sched_domain *tmp, *sd = NULL;
@@ -2608,12 +2677,7 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
for_each_cpu_mask(i, sibling_map) {
runqueue_t *smt_rq = cpu_rq(i);
- /*
- * If an SMT sibling task is sleeping due to priority
- * reasons wake it up now.
- */
- if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running)
- resched_task(smt_rq->idle);
+ wakeup_busy_runqueue(smt_rq);
}
for_each_cpu_mask(i, sibling_map)
@@ -2624,6 +2688,16 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
*/
}
+/*
+ * number of 'lost' timeslices this task wont be able to fully
+ * utilize, if another task runs on a sibling. This models the
+ * slowdown effect of other tasks running on siblings:
+ */
+static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
+{
+ return p->time_slice * (100 - sd->per_cpu_gain) / 100;
+}
+
static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
{
struct sched_domain *tmp, *sd = NULL;
@@ -2667,6 +2741,10 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
runqueue_t *smt_rq = cpu_rq(i);
task_t *smt_curr = smt_rq->curr;
+ /* Kernel threads do not participate in dependent sleeping */
+ if (!p->mm || !smt_curr->mm || rt_task(p))
+ goto check_smt_task;
+
/*
* If a user task with lower static priority than the
* running task on the SMT sibling is trying to schedule,
@@ -2675,21 +2753,45 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
* task from using an unfair proportion of the
* physical cpu's resources. -ck
*/
- if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) >
- task_timeslice(p) || rt_task(smt_curr)) &&
- p->mm && smt_curr->mm && !rt_task(p))
- ret = 1;
+ if (rt_task(smt_curr)) {
+ /*
+ * With real time tasks we run non-rt tasks only
+ * per_cpu_gain% of the time.
+ */
+ if ((jiffies % DEF_TIMESLICE) >
+ (sd->per_cpu_gain * DEF_TIMESLICE / 100))
+ ret = 1;
+ } else
+ if (smt_curr->static_prio < p->static_prio &&
+ !TASK_PREEMPTS_CURR(p, smt_rq) &&
+ smt_slice(smt_curr, sd) > task_timeslice(p))
+ ret = 1;
+
+check_smt_task:
+ if ((!smt_curr->mm && smt_curr != smt_rq->idle) ||
+ rt_task(smt_curr))
+ continue;
+ if (!p->mm) {
+ wakeup_busy_runqueue(smt_rq);
+ continue;
+ }
/*
- * Reschedule a lower priority task on the SMT sibling,
- * or wake it up if it has been put to sleep for priority
- * reasons.
+ * Reschedule a lower priority task on the SMT sibling for
+ * it to be put to sleep, or wake it up if it has been put to
+ * sleep for priority reasons to see if it should run now.
*/
- if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) >
- task_timeslice(smt_curr) || rt_task(p)) &&
- smt_curr->mm && p->mm && !rt_task(smt_curr)) ||
- (smt_curr == smt_rq->idle && smt_rq->nr_running))
- resched_task(smt_curr);
+ if (rt_task(p)) {
+ if ((jiffies % DEF_TIMESLICE) >
+ (sd->per_cpu_gain * DEF_TIMESLICE / 100))
+ resched_task(smt_curr);
+ } else {
+ if (TASK_PREEMPTS_CURR(p, smt_rq) &&
+ smt_slice(p, sd) > task_timeslice(smt_curr))
+ resched_task(smt_curr);
+ else
+ wakeup_busy_runqueue(smt_rq);
+ }
}
out_unlock:
for_each_cpu_mask(i, sibling_map)
@@ -2887,6 +2989,7 @@ switch_tasks:
if (next == rq->idle)
schedstat_inc(rq, sched_goidle);
prefetch(next);
+ prefetch_stack(next);
clear_tsk_need_resched(prev);
rcu_qsctr_inc(task_cpu(prev));
@@ -3014,7 +3117,8 @@ need_resched:
#endif /* CONFIG_PREEMPT */
-int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
+int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
+ void *key)
{
task_t *p = curr->private;
return try_to_wake_up(p, mode, sync);
@@ -3056,7 +3160,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
* @key: is directly passed to the wakeup function
*/
void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
- int nr_exclusive, void *key)
+ int nr_exclusive, void *key)
{
unsigned long flags;
@@ -3088,7 +3192,8 @@ void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
*
* On UP it can prevent extra preemption.
*/
-void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+void fastcall
+__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
{
unsigned long flags;
int sync = 1;
@@ -3279,7 +3384,8 @@ void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
EXPORT_SYMBOL(interruptible_sleep_on);
-long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
+long fastcall __sched
+interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
SLEEP_ON_VAR
@@ -3378,8 +3484,8 @@ EXPORT_SYMBOL(set_user_nice);
*/
int can_nice(const task_t *p, const int nice)
{
- /* convert nice value [19,-20] to rlimit style value [0,39] */
- int nice_rlim = 19 - nice;
+ /* convert nice value [19,-20] to rlimit style value [1,40] */
+ int nice_rlim = 20 - nice;
return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
capable(CAP_SYS_NICE));
}
@@ -3486,7 +3592,7 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
p->policy = policy;
p->rt_priority = prio;
if (policy != SCHED_NORMAL)
- p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority;
+ p->prio = MAX_RT_PRIO-1 - p->rt_priority;
else
p->prio = p->static_prio;
}
@@ -3498,7 +3604,8 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
* @policy: new policy.
* @param: structure containing the new RT priority.
*/
-int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param)
+int sched_setscheduler(struct task_struct *p, int policy,
+ struct sched_param *param)
{
int retval;
int oldprio, oldpolicy = -1;
@@ -3518,7 +3625,8 @@ recheck:
* 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
*/
if (param->sched_priority < 0 ||
- param->sched_priority > MAX_USER_RT_PRIO-1)
+ (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
+ (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
return -EINVAL;
if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
return -EINVAL;
@@ -3528,7 +3636,8 @@ recheck:
*/
if (!capable(CAP_SYS_NICE)) {
/* can't change policy */
- if (policy != p->policy)
+ if (policy != p->policy &&
+ !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
return -EPERM;
/* can't increase priority */
if (policy != SCHED_NORMAL &&
@@ -3579,7 +3688,8 @@ recheck:
}
EXPORT_SYMBOL_GPL(sched_setscheduler);
-static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
+static int
+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
{
int retval;
struct sched_param lparam;
@@ -3846,7 +3956,7 @@ asmlinkage long sys_sched_yield(void)
if (rt_task(current))
target = rq->active;
- if (current->array->nr_active == 1) {
+ if (array->nr_active == 1) {
schedstat_inc(rq, yld_act_empty);
if (!rq->expired->nr_active)
schedstat_inc(rq, yld_both_empty);
@@ -3877,6 +3987,13 @@ asmlinkage long sys_sched_yield(void)
static inline void __cond_resched(void)
{
+ /*
+ * The BKS might be reacquired before we have dropped
+ * PREEMPT_ACTIVE, which could trigger a second
+ * cond_resched() call.
+ */
+ if (unlikely(preempt_count()))
+ return;
do {
add_preempt_count(PREEMPT_ACTIVE);
schedule();
@@ -3903,7 +4020,7 @@ EXPORT_SYMBOL(cond_resched);
* operations here to prevent schedule() from being called twice (once via
* spin_unlock(), once by hand).
*/
-int cond_resched_lock(spinlock_t * lock)
+int cond_resched_lock(spinlock_t *lock)
{
int ret = 0;
@@ -4086,7 +4203,7 @@ static inline struct task_struct *younger_sibling(struct task_struct *p)
return list_entry(p->sibling.next,struct task_struct,sibling);
}
-static void show_task(task_t * p)
+static void show_task(task_t *p)
{
task_t *relative;
unsigned state;
@@ -4112,7 +4229,7 @@ static void show_task(task_t * p)
#endif
#ifdef CONFIG_DEBUG_STACK_USAGE
{
- unsigned long * n = (unsigned long *) (p->thread_info+1);
+ unsigned long *n = (unsigned long *) (p->thread_info+1);
while (!*n)
n++;
free = (unsigned long) n - (unsigned long)(p->thread_info+1);
@@ -4321,7 +4438,7 @@ out:
* thread migration by bumping thread off CPU then 'pushing' onto
* another runqueue.
*/
-static int migration_thread(void * data)
+static int migration_thread(void *data)
{
runqueue_t *rq;
int cpu = (long)data;
@@ -4770,7 +4887,7 @@ static int sd_parent_degenerate(struct sched_domain *sd,
* Attach the domain 'sd' to 'cpu' as its base domain. Callers must
* hold the hotplug lock.
*/
-void cpu_attach_domain(struct sched_domain *sd, int cpu)
+static void cpu_attach_domain(struct sched_domain *sd, int cpu)
{
runqueue_t *rq = cpu_rq(cpu);
struct sched_domain *tmp;
@@ -4793,7 +4910,7 @@ void cpu_attach_domain(struct sched_domain *sd, int cpu)
}
/* cpus with isolated domains */
-cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
+static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
/* Setup the mask of cpus configured for isolated domains */
static int __init isolated_cpu_setup(char *str)
@@ -4821,8 +4938,8 @@ __setup ("isolcpus=", isolated_cpu_setup);
* covered by the given span, and will set each group's ->cpumask correctly,
* and ->cpu_power to 0.
*/
-void init_sched_build_groups(struct sched_group groups[],
- cpumask_t span, int (*group_fn)(int cpu))
+static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
+ int (*group_fn)(int cpu))
{
struct sched_group *first = NULL, *last = NULL;
cpumask_t covered = CPU_MASK_NONE;
@@ -4855,12 +4972,85 @@ void init_sched_build_groups(struct sched_group groups[],
last->next = first;
}
+#define SD_NODES_PER_DOMAIN 16
-#ifdef ARCH_HAS_SCHED_DOMAIN
-extern void build_sched_domains(const cpumask_t *cpu_map);
-extern void arch_init_sched_domains(const cpumask_t *cpu_map);
-extern void arch_destroy_sched_domains(const cpumask_t *cpu_map);
-#else
+#ifdef CONFIG_NUMA
+/**
+ * find_next_best_node - find the next node to include in a sched_domain
+ * @node: node whose sched_domain we're building
+ * @used_nodes: nodes already in the sched_domain
+ *
+ * Find the next node to include in a given scheduling domain. Simply
+ * finds the closest node not already in the @used_nodes map.
+ *
+ * Should use nodemask_t.
+ */
+static int find_next_best_node(int node, unsigned long *used_nodes)
+{
+ int i, n, val, min_val, best_node = 0;
+
+ min_val = INT_MAX;
+
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ /* Start at @node */
+ n = (node + i) % MAX_NUMNODES;
+
+ if (!nr_cpus_node(n))
+ continue;
+
+ /* Skip already used nodes */
+ if (test_bit(n, used_nodes))
+ continue;
+
+ /* Simple min distance search */
+ val = node_distance(node, n);
+
+ if (val < min_val) {
+ min_val = val;
+ best_node = n;
+ }
+ }
+
+ set_bit(best_node, used_nodes);
+ return best_node;
+}
+
+/**
+ * sched_domain_node_span - get a cpumask for a node's sched_domain
+ * @node: node whose cpumask we're constructing
+ * @size: number of nodes to include in this span
+ *
+ * Given a node, construct a good cpumask for its sched_domain to span. It
+ * should be one that prevents unnecessary balancing, but also spreads tasks
+ * out optimally.
+ */
+static cpumask_t sched_domain_node_span(int node)
+{
+ int i;
+ cpumask_t span, nodemask;
+ DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
+
+ cpus_clear(span);
+ bitmap_zero(used_nodes, MAX_NUMNODES);
+
+ nodemask = node_to_cpumask(node);
+ cpus_or(span, span, nodemask);
+ set_bit(node, used_nodes);
+
+ for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
+ int next_node = find_next_best_node(node, used_nodes);
+ nodemask = node_to_cpumask(next_node);
+ cpus_or(span, span, nodemask);
+ }
+
+ return span;
+}
+#endif
+
+/*
+ * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
+ * can switch it on easily if needed.
+ */
#ifdef CONFIG_SCHED_SMT
static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
static struct sched_group sched_group_cpus[NR_CPUS];
@@ -4882,36 +5072,20 @@ static int cpu_to_phys_group(int cpu)
}
#ifdef CONFIG_NUMA
-
-static DEFINE_PER_CPU(struct sched_domain, node_domains);
-static struct sched_group sched_group_nodes[MAX_NUMNODES];
-static int cpu_to_node_group(int cpu)
-{
- return cpu_to_node(cpu);
-}
-#endif
-
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
/*
- * The domains setup code relies on siblings not spanning
- * multiple nodes. Make sure the architecture has a proper
- * siblings map:
+ * The init_sched_build_groups can't handle what we want to do with node
+ * groups, so roll our own. Now each node has its own list of groups which
+ * gets dynamically allocated.
*/
-static void check_sibling_maps(void)
-{
- int i, j;
+static DEFINE_PER_CPU(struct sched_domain, node_domains);
+static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
- for_each_online_cpu(i) {
- for_each_cpu_mask(j, cpu_sibling_map[i]) {
- if (cpu_to_node(i) != cpu_to_node(j)) {
- printk(KERN_INFO "warning: CPU %d siblings map "
- "to different node - isolating "
- "them.\n", i);
- cpu_sibling_map[i] = cpumask_of_cpu(i);
- break;
- }
- }
- }
+static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
+static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
+
+static int cpu_to_allnodes_group(int cpu)
+{
+ return cpu_to_node(cpu);
}
#endif
@@ -4919,9 +5093,24 @@ static void check_sibling_maps(void)
* Build sched domains for a given set of cpus and attach the sched domains
* to the individual cpus
*/
-static void build_sched_domains(const cpumask_t *cpu_map)
+void build_sched_domains(const cpumask_t *cpu_map)
{
int i;
+#ifdef CONFIG_NUMA
+ struct sched_group **sched_group_nodes = NULL;
+ struct sched_group *sched_group_allnodes = NULL;
+
+ /*
+ * Allocate the per-node list of sched groups
+ */
+ sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
+ GFP_ATOMIC);
+ if (!sched_group_nodes) {
+ printk(KERN_WARNING "Can not alloc sched group node list\n");
+ return;
+ }
+ sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
+#endif
/*
* Set up domains for cpus specified by the cpu_map.
@@ -4934,11 +5123,35 @@ static void build_sched_domains(const cpumask_t *cpu_map)
cpus_and(nodemask, nodemask, *cpu_map);
#ifdef CONFIG_NUMA
+ if (cpus_weight(*cpu_map)
+ > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
+ if (!sched_group_allnodes) {
+ sched_group_allnodes
+ = kmalloc(sizeof(struct sched_group)
+ * MAX_NUMNODES,
+ GFP_KERNEL);
+ if (!sched_group_allnodes) {
+ printk(KERN_WARNING
+ "Can not alloc allnodes sched group\n");
+ break;
+ }
+ sched_group_allnodes_bycpu[i]
+ = sched_group_allnodes;
+ }
+ sd = &per_cpu(allnodes_domains, i);
+ *sd = SD_ALLNODES_INIT;
+ sd->span = *cpu_map;
+ group = cpu_to_allnodes_group(i);
+ sd->groups = &sched_group_allnodes[group];
+ p = sd;
+ } else
+ p = NULL;
+
sd = &per_cpu(node_domains, i);
- group = cpu_to_node_group(i);
*sd = SD_NODE_INIT;
- sd->span = *cpu_map;
- sd->groups = &sched_group_nodes[group];
+ sd->span = sched_domain_node_span(cpu_to_node(i));
+ sd->parent = p;
+ cpus_and(sd->span, sd->span, *cpu_map);
#endif
p = sd;
@@ -4963,7 +5176,7 @@ static void build_sched_domains(const cpumask_t *cpu_map)
#ifdef CONFIG_SCHED_SMT
/* Set up CPU (sibling) groups */
- for_each_online_cpu(i) {
+ for_each_cpu_mask(i, *cpu_map) {
cpumask_t this_sibling_map = cpu_sibling_map[i];
cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
if (i != first_cpu(this_sibling_map))
@@ -4988,8 +5201,77 @@ static void build_sched_domains(const cpumask_t *cpu_map)
#ifdef CONFIG_NUMA
/* Set up node groups */
- init_sched_build_groups(sched_group_nodes, *cpu_map,
- &cpu_to_node_group);
+ if (sched_group_allnodes)
+ init_sched_build_groups(sched_group_allnodes, *cpu_map,
+ &cpu_to_allnodes_group);
+
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ /* Set up node groups */
+ struct sched_group *sg, *prev;
+ cpumask_t nodemask = node_to_cpumask(i);
+ cpumask_t domainspan;
+ cpumask_t covered = CPU_MASK_NONE;
+ int j;
+
+ cpus_and(nodemask, nodemask, *cpu_map);
+ if (cpus_empty(nodemask)) {
+ sched_group_nodes[i] = NULL;
+ continue;
+ }
+
+ domainspan = sched_domain_node_span(i);
+ cpus_and(domainspan, domainspan, *cpu_map);
+
+ sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+ sched_group_nodes[i] = sg;
+ for_each_cpu_mask(j, nodemask) {
+ struct sched_domain *sd;
+ sd = &per_cpu(node_domains, j);
+ sd->groups = sg;
+ if (sd->groups == NULL) {
+ /* Turn off balancing if we have no groups */
+ sd->flags = 0;
+ }
+ }
+ if (!sg) {
+ printk(KERN_WARNING
+ "Can not alloc domain group for node %d\n", i);
+ continue;
+ }
+ sg->cpu_power = 0;
+ sg->cpumask = nodemask;
+ cpus_or(covered, covered, nodemask);
+ prev = sg;
+
+ for (j = 0; j < MAX_NUMNODES; j++) {
+ cpumask_t tmp, notcovered;
+ int n = (i + j) % MAX_NUMNODES;
+
+ cpus_complement(notcovered, covered);
+ cpus_and(tmp, notcovered, *cpu_map);
+ cpus_and(tmp, tmp, domainspan);
+ if (cpus_empty(tmp))
+ break;
+
+ nodemask = node_to_cpumask(n);
+ cpus_and(tmp, tmp, nodemask);
+ if (cpus_empty(tmp))
+ continue;
+
+ sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+ if (!sg) {
+ printk(KERN_WARNING
+ "Can not alloc domain group for node %d\n", j);
+ break;
+ }
+ sg->cpu_power = 0;
+ sg->cpumask = tmp;
+ cpus_or(covered, covered, tmp);
+ prev->next = sg;
+ prev = sg;
+ }
+ prev->next = sched_group_nodes[i];
+ }
#endif
/* Calculate CPU power for physical packages and nodes */
@@ -5008,14 +5290,46 @@ static void build_sched_domains(const cpumask_t *cpu_map)
sd->groups->cpu_power = power;
#ifdef CONFIG_NUMA
- if (i == first_cpu(sd->groups->cpumask)) {
- /* Only add "power" once for each physical package. */
- sd = &per_cpu(node_domains, i);
- sd->groups->cpu_power += power;
+ sd = &per_cpu(allnodes_domains, i);
+ if (sd->groups) {
+ power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
+ (cpus_weight(sd->groups->cpumask)-1) / 10;
+ sd->groups->cpu_power = power;
}
#endif
}
+#ifdef CONFIG_NUMA
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ struct sched_group *sg = sched_group_nodes[i];
+ int j;
+
+ if (sg == NULL)
+ continue;
+next_sg:
+ for_each_cpu_mask(j, sg->cpumask) {
+ struct sched_domain *sd;
+ int power;
+
+ sd = &per_cpu(phys_domains, j);
+ if (j != first_cpu(sd->groups->cpumask)) {
+ /*
+ * Only add "power" once for each
+ * physical package.
+ */
+ continue;
+ }
+ power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
+ (cpus_weight(sd->groups->cpumask)-1) / 10;
+
+ sg->cpu_power += power;
+ }
+ sg = sg->next;
+ if (sg != sched_group_nodes[i])
+ goto next_sg;
+ }
+#endif
+
/* Attach the domains */
for_each_cpu_mask(i, *cpu_map) {
struct sched_domain *sd;
@@ -5030,13 +5344,10 @@ static void build_sched_domains(const cpumask_t *cpu_map)
/*
* Set up scheduler domains and groups. Callers must hold the hotplug lock.
*/
-static void arch_init_sched_domains(cpumask_t *cpu_map)
+static void arch_init_sched_domains(const cpumask_t *cpu_map)
{
cpumask_t cpu_default_map;
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
- check_sibling_maps();
-#endif
/*
* Setup mask for cpus without special case scheduling requirements.
* For now this just excludes isolated cpus, but could be used to
@@ -5049,10 +5360,47 @@ static void arch_init_sched_domains(cpumask_t *cpu_map)
static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
{
- /* Do nothing: everything is statically allocated. */
-}
+#ifdef CONFIG_NUMA
+ int i;
+ int cpu;
-#endif /* ARCH_HAS_SCHED_DOMAIN */
+ for_each_cpu_mask(cpu, *cpu_map) {
+ struct sched_group *sched_group_allnodes
+ = sched_group_allnodes_bycpu[cpu];
+ struct sched_group **sched_group_nodes
+ = sched_group_nodes_bycpu[cpu];
+
+ if (sched_group_allnodes) {
+ kfree(sched_group_allnodes);
+ sched_group_allnodes_bycpu[cpu] = NULL;
+ }
+
+ if (!sched_group_nodes)
+ continue;
+
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ cpumask_t nodemask = node_to_cpumask(i);
+ struct sched_group *oldsg, *sg = sched_group_nodes[i];
+
+ cpus_and(nodemask, nodemask, *cpu_map);
+ if (cpus_empty(nodemask))
+ continue;
+
+ if (sg == NULL)
+ continue;
+ sg = sg->next;
+next_sg:
+ oldsg = sg;
+ sg = sg->next;
+ kfree(oldsg);
+ if (oldsg != sched_group_nodes[i])
+ goto next_sg;
+ }
+ kfree(sched_group_nodes);
+ sched_group_nodes_bycpu[cpu] = NULL;
+ }
+#endif
+}
/*
* Detach sched domains from a group of cpus specified in cpu_map
@@ -5254,3 +5602,47 @@ void normalize_rt_tasks(void)
}
#endif /* CONFIG_MAGIC_SYSRQ */
+
+#ifdef CONFIG_IA64
+/*
+ * These functions are only useful for the IA64 MCA handling.
+ *
+ * They can only be called when the whole system has been
+ * stopped - every CPU needs to be quiescent, and no scheduling
+ * activity can take place. Using them for anything else would
+ * be a serious bug, and as a result, they aren't even visible
+ * under any other configuration.
+ */
+
+/**
+ * curr_task - return the current task for a given cpu.
+ * @cpu: the processor in question.
+ *
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ */
+task_t *curr_task(int cpu)
+{
+ return cpu_curr(cpu);
+}
+
+/**
+ * set_curr_task - set the current task for a given cpu.
+ * @cpu: the processor in question.
+ * @p: the task pointer to set.
+ *
+ * Description: This function must only be used when non-maskable interrupts
+ * are serviced on a separate stack. It allows the architecture to switch the
+ * notion of the current task on a cpu in a non-blocking manner. This function
+ * must be called with all CPU's synchronized, and interrupts disabled, the
+ * and caller must save the original value of the current task (see
+ * curr_task() above) and restore that value before reenabling interrupts and
+ * re-starting the system.
+ *
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ */
+void set_curr_task(int cpu, task_t *p)
+{
+ cpu_curr(cpu) = p;
+}
+
+#endif
diff --git a/kernel/signal.c b/kernel/signal.c
index ca1186eef938..b92c3c9f8b9a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -678,7 +678,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
/* forward decl */
static void do_notify_parent_cldstop(struct task_struct *tsk,
- struct task_struct *parent,
+ int to_self,
int why);
/*
@@ -692,7 +692,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
{
struct task_struct *t;
- if (p->flags & SIGNAL_GROUP_EXIT)
+ if (p->signal->flags & SIGNAL_GROUP_EXIT)
/*
* The process is in the middle of dying already.
*/
@@ -729,14 +729,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
p->signal->group_stop_count = 0;
p->signal->flags = SIGNAL_STOP_CONTINUED;
spin_unlock(&p->sighand->siglock);
- if (p->ptrace & PT_PTRACED)
- do_notify_parent_cldstop(p, p->parent,
- CLD_STOPPED);
- else
- do_notify_parent_cldstop(
- p->group_leader,
- p->group_leader->real_parent,
- CLD_STOPPED);
+ do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_STOPPED);
spin_lock(&p->sighand->siglock);
}
rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending);
@@ -777,14 +770,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
p->signal->flags = SIGNAL_STOP_CONTINUED;
p->signal->group_exit_code = 0;
spin_unlock(&p->sighand->siglock);
- if (p->ptrace & PT_PTRACED)
- do_notify_parent_cldstop(p, p->parent,
- CLD_CONTINUED);
- else
- do_notify_parent_cldstop(
- p->group_leader,
- p->group_leader->real_parent,
- CLD_CONTINUED);
+ do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_CONTINUED);
spin_lock(&p->sighand->siglock);
} else {
/*
@@ -1380,16 +1366,16 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
unsigned long flags;
int ret = 0;
- /*
- * We need the tasklist lock even for the specific
- * thread case (when we don't need to follow the group
- * lists) in order to avoid races with "p->sighand"
- * going away or changing from under us.
- */
BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
- read_lock(&tasklist_lock);
+ read_lock(&tasklist_lock);
+
+ if (unlikely(p->flags & PF_EXITING)) {
+ ret = -1;
+ goto out_err;
+ }
+
spin_lock_irqsave(&p->sighand->siglock, flags);
-
+
if (unlikely(!list_empty(&q->list))) {
/*
* If an SI_TIMER entry is already queue just increment
@@ -1399,7 +1385,7 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
BUG();
q->info.si_overrun++;
goto out;
- }
+ }
/* Short-circuit ignored signals. */
if (sig_ignored(p, sig)) {
ret = 1;
@@ -1414,8 +1400,10 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
out:
spin_unlock_irqrestore(&p->sighand->siglock, flags);
+out_err:
read_unlock(&tasklist_lock);
- return(ret);
+
+ return ret;
}
int
@@ -1542,14 +1530,20 @@ void do_notify_parent(struct task_struct *tsk, int sig)
spin_unlock_irqrestore(&psig->siglock, flags);
}
-static void
-do_notify_parent_cldstop(struct task_struct *tsk, struct task_struct *parent,
- int why)
+static void do_notify_parent_cldstop(struct task_struct *tsk, int to_self, int why)
{
struct siginfo info;
unsigned long flags;
+ struct task_struct *parent;
struct sighand_struct *sighand;
+ if (to_self)
+ parent = tsk->parent;
+ else {
+ tsk = tsk->group_leader;
+ parent = tsk->real_parent;
+ }
+
info.si_signo = SIGCHLD;
info.si_errno = 0;
info.si_pid = tsk->pid;
@@ -1618,8 +1612,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
!(current->ptrace & PT_ATTACHED)) &&
(likely(current->parent->signal != current->signal) ||
!unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) {
- do_notify_parent_cldstop(current, current->parent,
- CLD_TRAPPED);
+ do_notify_parent_cldstop(current, 1, CLD_TRAPPED);
read_unlock(&tasklist_lock);
schedule();
} else {
@@ -1668,25 +1661,25 @@ void ptrace_notify(int exit_code)
static void
finish_stop(int stop_count)
{
+ int to_self;
+
/*
* If there are no other threads in the group, or if there is
* a group stop in progress and we are the last to stop,
* report to the parent. When ptraced, every thread reports itself.
*/
- if (stop_count < 0 || (current->ptrace & PT_PTRACED)) {
- read_lock(&tasklist_lock);
- do_notify_parent_cldstop(current, current->parent,
- CLD_STOPPED);
- read_unlock(&tasklist_lock);
- }
- else if (stop_count == 0) {
- read_lock(&tasklist_lock);
- do_notify_parent_cldstop(current->group_leader,
- current->group_leader->real_parent,
- CLD_STOPPED);
- read_unlock(&tasklist_lock);
- }
+ if (stop_count < 0 || (current->ptrace & PT_PTRACED))
+ to_self = 1;
+ else if (stop_count == 0)
+ to_self = 0;
+ else
+ goto out;
+ read_lock(&tasklist_lock);
+ do_notify_parent_cldstop(current, to_self, CLD_STOPPED);
+ read_unlock(&tasklist_lock);
+
+out:
schedule();
/*
* Now we don't run again until continued.
@@ -2228,8 +2221,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese,
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
- current->state = TASK_INTERRUPTIBLE;
- timeout = schedule_timeout(timeout);
+ timeout = schedule_timeout_interruptible(timeout);
try_to_freeze();
spin_lock_irq(&current->sighand->siglock);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index b4ab6af1dea8..f766b2fc48be 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -84,7 +84,7 @@ asmlinkage void __do_softirq(void)
cpu = smp_processor_id();
restart:
/* Reset the pending bitmask before enabling irqs */
- local_softirq_pending() = 0;
+ set_softirq_pending(0);
local_irq_enable();
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
new file mode 100644
index 000000000000..75976209cea7
--- /dev/null
+++ b/kernel/softlockup.c
@@ -0,0 +1,151 @@
+/*
+ * Detect Soft Lockups
+ *
+ * started by Ingo Molnar, (C) 2005, Red Hat
+ *
+ * this code detects soft lockups: incidents in where on a CPU
+ * the kernel does not reschedule for 10 seconds or more.
+ */
+
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/notifier.h>
+#include <linux/module.h>
+
+static DEFINE_SPINLOCK(print_lock);
+
+static DEFINE_PER_CPU(unsigned long, timestamp) = 0;
+static DEFINE_PER_CPU(unsigned long, print_timestamp) = 0;
+static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
+
+static int did_panic = 0;
+static int softlock_panic(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ did_panic = 1;
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block panic_block = {
+ .notifier_call = softlock_panic,
+};
+
+void touch_softlockup_watchdog(void)
+{
+ per_cpu(timestamp, raw_smp_processor_id()) = jiffies;
+}
+EXPORT_SYMBOL(touch_softlockup_watchdog);
+
+/*
+ * This callback runs from the timer interrupt, and checks
+ * whether the watchdog thread has hung or not:
+ */
+void softlockup_tick(struct pt_regs *regs)
+{
+ int this_cpu = smp_processor_id();
+ unsigned long timestamp = per_cpu(timestamp, this_cpu);
+
+ if (per_cpu(print_timestamp, this_cpu) == timestamp)
+ return;
+
+ /* Do not cause a second panic when there already was one */
+ if (did_panic)
+ return;
+
+ if (time_after(jiffies, timestamp + 10*HZ)) {
+ per_cpu(print_timestamp, this_cpu) = timestamp;
+
+ spin_lock(&print_lock);
+ printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n",
+ this_cpu);
+ show_regs(regs);
+ spin_unlock(&print_lock);
+ }
+}
+
+/*
+ * The watchdog thread - runs every second and touches the timestamp.
+ */
+static int watchdog(void * __bind_cpu)
+{
+ struct sched_param param = { .sched_priority = 99 };
+ int this_cpu = (long) __bind_cpu;
+
+ printk("softlockup thread %d started up.\n", this_cpu);
+
+ sched_setscheduler(current, SCHED_FIFO, &param);
+ current->flags |= PF_NOFREEZE;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ /*
+ * Run briefly once per second - if this gets delayed for
+ * more than 10 seconds then the debug-printout triggers
+ * in softlockup_tick():
+ */
+ while (!kthread_should_stop()) {
+ msleep_interruptible(1000);
+ touch_softlockup_watchdog();
+ }
+ __set_current_state(TASK_RUNNING);
+
+ return 0;
+}
+
+/*
+ * Create/destroy watchdog threads as CPUs come and go:
+ */
+static int __devinit
+cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+ int hotcpu = (unsigned long)hcpu;
+ struct task_struct *p;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ BUG_ON(per_cpu(watchdog_task, hotcpu));
+ p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
+ if (IS_ERR(p)) {
+ printk("watchdog for %i failed\n", hotcpu);
+ return NOTIFY_BAD;
+ }
+ per_cpu(watchdog_task, hotcpu) = p;
+ kthread_bind(p, hotcpu);
+ break;
+ case CPU_ONLINE:
+
+ wake_up_process(per_cpu(watchdog_task, hotcpu));
+ break;
+#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_UP_CANCELED:
+ /* Unbind so it can run. Fall thru. */
+ kthread_bind(per_cpu(watchdog_task, hotcpu), smp_processor_id());
+ case CPU_DEAD:
+ p = per_cpu(watchdog_task, hotcpu);
+ per_cpu(watchdog_task, hotcpu) = NULL;
+ kthread_stop(p);
+ break;
+#endif /* CONFIG_HOTPLUG_CPU */
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __devinitdata cpu_nfb = {
+ .notifier_call = cpu_callback
+};
+
+__init void spawn_softlockup_task(void)
+{
+ void *cpu = (void *)(long)smp_processor_id();
+
+ cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+ cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
+ register_cpu_notifier(&cpu_nfb);
+
+ notifier_chain_register(&panic_notifier_list, &panic_block);
+}
+
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 0c3f9d8bbe17..0375fcd5921d 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -3,7 +3,10 @@
*
* Author: Zwane Mwaikambo <zwane@fsmlabs.com>
*
- * Copyright (2004) Ingo Molnar
+ * Copyright (2004, 2005) Ingo Molnar
+ *
+ * This file contains the spinlock/rwlock implementations for the
+ * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them)
*/
#include <linux/config.h>
@@ -17,12 +20,12 @@
* Generic declaration of the raw read_trylock() function,
* architectures are supposed to optimize this:
*/
-int __lockfunc generic_raw_read_trylock(rwlock_t *lock)
+int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock)
{
- _raw_read_lock(lock);
+ __raw_read_lock(lock);
return 1;
}
-EXPORT_SYMBOL(generic_raw_read_trylock);
+EXPORT_SYMBOL(generic__raw_read_trylock);
int __lockfunc _spin_trylock(spinlock_t *lock)
{
@@ -57,7 +60,7 @@ int __lockfunc _write_trylock(rwlock_t *lock)
}
EXPORT_SYMBOL(_write_trylock);
-#ifndef CONFIG_PREEMPT
+#if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP)
void __lockfunc _read_lock(rwlock_t *lock)
{
@@ -72,7 +75,7 @@ unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
local_irq_save(flags);
preempt_disable();
- _raw_spin_lock_flags(lock, flags);
+ _raw_spin_lock_flags(lock, &flags);
return flags;
}
EXPORT_SYMBOL(_spin_lock_irqsave);
diff --git a/kernel/sys.c b/kernel/sys.c
index 9a24374c23bc..c80412be2302 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -361,6 +361,64 @@ out_unlock:
return retval;
}
+void emergency_restart(void)
+{
+ machine_emergency_restart();
+}
+EXPORT_SYMBOL_GPL(emergency_restart);
+
+void kernel_restart(char *cmd)
+{
+ notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
+ system_state = SYSTEM_RESTART;
+ device_shutdown();
+ if (!cmd) {
+ printk(KERN_EMERG "Restarting system.\n");
+ } else {
+ printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
+ }
+ printk(".\n");
+ machine_restart(cmd);
+}
+EXPORT_SYMBOL_GPL(kernel_restart);
+
+void kernel_kexec(void)
+{
+#ifdef CONFIG_KEXEC
+ struct kimage *image;
+ image = xchg(&kexec_image, 0);
+ if (!image) {
+ return;
+ }
+ notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
+ system_state = SYSTEM_RESTART;
+ device_shutdown();
+ printk(KERN_EMERG "Starting new kernel\n");
+ machine_shutdown();
+ machine_kexec(image);
+#endif
+}
+EXPORT_SYMBOL_GPL(kernel_kexec);
+
+void kernel_halt(void)
+{
+ notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL);
+ system_state = SYSTEM_HALT;
+ device_shutdown();
+ printk(KERN_EMERG "System halted.\n");
+ machine_halt();
+}
+EXPORT_SYMBOL_GPL(kernel_halt);
+
+void kernel_power_off(void)
+{
+ notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL);
+ system_state = SYSTEM_POWER_OFF;
+ device_shutdown();
+ printk(KERN_EMERG "Power down.\n");
+ machine_power_off();
+}
+EXPORT_SYMBOL_GPL(kernel_power_off);
/*
* Reboot system call: for obvious reasons only root may call it,
@@ -389,11 +447,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
lock_kernel();
switch (cmd) {
case LINUX_REBOOT_CMD_RESTART:
- notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
- system_state = SYSTEM_RESTART;
- device_shutdown();
- printk(KERN_EMERG "Restarting system.\n");
- machine_restart(NULL);
+ kernel_restart(NULL);
break;
case LINUX_REBOOT_CMD_CAD_ON:
@@ -405,23 +459,13 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
break;
case LINUX_REBOOT_CMD_HALT:
- notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL);
- system_state = SYSTEM_HALT;
- device_suspend(PMSG_SUSPEND);
- device_shutdown();
- printk(KERN_EMERG "System halted.\n");
- machine_halt();
+ kernel_halt();
unlock_kernel();
do_exit(0);
break;
case LINUX_REBOOT_CMD_POWER_OFF:
- notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL);
- system_state = SYSTEM_POWER_OFF;
- device_suspend(PMSG_SUSPEND);
- device_shutdown();
- printk(KERN_EMERG "Power down.\n");
- machine_power_off();
+ kernel_power_off();
unlock_kernel();
do_exit(0);
break;
@@ -433,32 +477,14 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
}
buffer[sizeof(buffer) - 1] = '\0';
- notifier_call_chain(&reboot_notifier_list, SYS_RESTART, buffer);
- system_state = SYSTEM_RESTART;
- device_suspend(PMSG_FREEZE);
- device_shutdown();
- printk(KERN_EMERG "Restarting system with command '%s'.\n", buffer);
- machine_restart(buffer);
+ kernel_restart(buffer);
break;
-#ifdef CONFIG_KEXEC
case LINUX_REBOOT_CMD_KEXEC:
- {
- struct kimage *image;
- image = xchg(&kexec_image, 0);
- if (!image) {
- unlock_kernel();
- return -EINVAL;
- }
- notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
- system_state = SYSTEM_RESTART;
- device_shutdown();
- printk(KERN_EMERG "Starting new kernel\n");
- machine_shutdown();
- machine_kexec(image);
- break;
- }
-#endif
+ kernel_kexec();
+ unlock_kernel();
+ return -EINVAL;
+
#ifdef CONFIG_SOFTWARE_SUSPEND
case LINUX_REBOOT_CMD_SW_SUSPEND:
{
@@ -478,8 +504,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
static void deferred_cad(void *dummy)
{
- notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
- machine_restart(NULL);
+ kernel_restart(NULL);
}
/*
@@ -1686,7 +1711,6 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
unsigned long arg4, unsigned long arg5)
{
long error;
- int sig;
error = security_task_prctl(option, arg2, arg3, arg4, arg5);
if (error)
@@ -1694,12 +1718,11 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
switch (option) {
case PR_SET_PDEATHSIG:
- sig = arg2;
- if (!valid_signal(sig)) {
+ if (!valid_signal(arg2)) {
error = -EINVAL;
break;
}
- current->pdeath_signal = sig;
+ current->pdeath_signal = arg2;
break;
case PR_GET_PDEATHSIG:
error = put_user(current->pdeath_signal, (int __user *)arg2);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 29196ce9b40f..1ab2370e2efa 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -79,7 +79,9 @@ cond_syscall(sys_request_key);
cond_syscall(sys_keyctl);
cond_syscall(compat_sys_keyctl);
cond_syscall(compat_sys_socketcall);
-cond_syscall(sys_set_zone_reclaim);
+cond_syscall(sys_inotify_init);
+cond_syscall(sys_inotify_add_watch);
+cond_syscall(sys_inotify_rm_watch);
/* arch-specific weak syscall entries */
cond_syscall(sys_pciconfig_read);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 270ee7fadbd8..8e56e2495542 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -31,6 +31,7 @@
#include <linux/smp_lock.h>
#include <linux/init.h>
#include <linux/kernel.h>
+#include <linux/net.h>
#include <linux/sysrq.h>
#include <linux/highuid.h>
#include <linux/writeback.h>
@@ -114,6 +115,7 @@ extern int unaligned_enabled;
extern int sysctl_ieee_emulation_warnings;
#endif
extern int sysctl_userprocess_debug;
+extern int spin_retry;
#endif
extern int sysctl_hz_timer;
@@ -135,9 +137,6 @@ static struct ctl_table_header root_table_header =
static ctl_table kern_table[];
static ctl_table vm_table[];
-#ifdef CONFIG_NET
-extern ctl_table net_table[];
-#endif
static ctl_table proc_table[];
static ctl_table fs_table[];
static ctl_table debug_table[];
@@ -146,6 +145,9 @@ extern ctl_table random_table[];
#ifdef CONFIG_UNIX98_PTYS
extern ctl_table pty_table[];
#endif
+#ifdef CONFIG_INOTIFY
+extern ctl_table inotify_table[];
+#endif
#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
int sysctl_legacy_va_layout;
@@ -218,6 +220,7 @@ static ctl_table root_table[] = {
.mode = 0555,
.child = dev_table,
},
+
{ .ctl_name = 0 }
};
@@ -643,7 +646,16 @@ static ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
-
+#if defined(CONFIG_ARCH_S390)
+ {
+ .ctl_name = KERN_SPIN_RETRY,
+ .procname = "spin_retry",
+ .data = &spin_retry,
+ .maxlen = sizeof (int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
{ .ctl_name = 0 }
};
@@ -950,6 +962,14 @@ static ctl_table fs_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+#ifdef CONFIG_INOTIFY
+ {
+ .ctl_name = FS_INOTIFY,
+ .procname = "inotify",
+ .mode = 0555,
+ .child = inotify_table,
+ },
+#endif
#endif
{
.ctl_name = KERN_SETUID_DUMPABLE,
@@ -968,7 +988,7 @@ static ctl_table debug_table[] = {
static ctl_table dev_table[] = {
{ .ctl_name = 0 }
-};
+};
extern void init_irq_proc (void);
diff --git a/kernel/time.c b/kernel/time.c
index d4335c1c884c..dd5ae1162a8f 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -128,7 +128,7 @@ asmlinkage long sys_gettimeofday(struct timeval __user *tv, struct timezone __us
* as real UNIX machines always do it. This avoids all headaches about
* daylight saving times and warping kernel clocks.
*/
-inline static void warp_clock(void)
+static inline void warp_clock(void)
{
write_seqlock_irq(&xtime_lock);
wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60;
diff --git a/kernel/timer.c b/kernel/timer.c
index f2a11887a726..3ba10fa35b60 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -950,6 +950,7 @@ void do_timer(struct pt_regs *regs)
{
jiffies_64++;
update_times();
+ softlockup_tick(regs);
}
#ifdef __ARCH_WANT_SYS_ALARM
@@ -1023,7 +1024,7 @@ asmlinkage long sys_getppid(void)
parent = me->group_leader->real_parent;
for (;;) {
pid = parent->tgid;
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
{
struct task_struct *old = parent;
@@ -1150,9 +1151,26 @@ fastcall signed long __sched schedule_timeout(signed long timeout)
out:
return timeout < 0 ? 0 : timeout;
}
-
EXPORT_SYMBOL(schedule_timeout);
+/*
+ * We can use __set_current_state() here because schedule_timeout() calls
+ * schedule() unconditionally.
+ */
+signed long __sched schedule_timeout_interruptible(signed long timeout)
+{
+ __set_current_state(TASK_INTERRUPTIBLE);
+ return schedule_timeout(timeout);
+}
+EXPORT_SYMBOL(schedule_timeout_interruptible);
+
+signed long __sched schedule_timeout_uninterruptible(signed long timeout)
+{
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ return schedule_timeout(timeout);
+}
+EXPORT_SYMBOL(schedule_timeout_uninterruptible);
+
/* Thread ID - the internal kernel "pid" */
asmlinkage long sys_gettid(void)
{
@@ -1169,8 +1187,7 @@ static long __sched nanosleep_restart(struct restart_block *restart)
if (!time_after(expire, now))
return 0;
- current->state = TASK_INTERRUPTIBLE;
- expire = schedule_timeout(expire - now);
+ expire = schedule_timeout_interruptible(expire - now);
ret = 0;
if (expire) {
@@ -1198,8 +1215,7 @@ asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __us
return -EINVAL;
expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
- current->state = TASK_INTERRUPTIBLE;
- expire = schedule_timeout(expire);
+ expire = schedule_timeout_interruptible(expire);
ret = 0;
if (expire) {
@@ -1428,7 +1444,7 @@ static inline u64 time_interpolator_get_cycles(unsigned int src)
}
}
-static inline u64 time_interpolator_get_counter(void)
+static inline u64 time_interpolator_get_counter(int writelock)
{
unsigned int src = time_interpolator->source;
@@ -1442,6 +1458,15 @@ static inline u64 time_interpolator_get_counter(void)
now = time_interpolator_get_cycles(src);
if (lcycle && time_after(lcycle, now))
return lcycle;
+
+ /* When holding the xtime write lock, there's no need
+ * to add the overhead of the cmpxchg. Readers are
+ * force to retry until the write lock is released.
+ */
+ if (writelock) {
+ time_interpolator->last_cycle = now;
+ return now;
+ }
/* Keep track of the last timer value returned. The use of cmpxchg here
* will cause contention in an SMP environment.
*/
@@ -1455,7 +1480,7 @@ static inline u64 time_interpolator_get_counter(void)
void time_interpolator_reset(void)
{
time_interpolator->offset = 0;
- time_interpolator->last_counter = time_interpolator_get_counter();
+ time_interpolator->last_counter = time_interpolator_get_counter(1);
}
#define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift)
@@ -1467,7 +1492,7 @@ unsigned long time_interpolator_get_offset(void)
return 0;
return time_interpolator->offset +
- GET_TI_NSECS(time_interpolator_get_counter(), time_interpolator);
+ GET_TI_NSECS(time_interpolator_get_counter(0), time_interpolator);
}
#define INTERPOLATOR_ADJUST 65536
@@ -1490,7 +1515,7 @@ static void time_interpolator_update(long delta_nsec)
* and the tuning logic insures that.
*/
- counter = time_interpolator_get_counter();
+ counter = time_interpolator_get_counter(1);
offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator);
if (delta_nsec < 0 || (unsigned long) delta_nsec < offset)
@@ -1588,10 +1613,8 @@ void msleep(unsigned int msecs)
{
unsigned long timeout = msecs_to_jiffies(msecs) + 1;
- while (timeout) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- timeout = schedule_timeout(timeout);
- }
+ while (timeout)
+ timeout = schedule_timeout_uninterruptible(timeout);
}
EXPORT_SYMBOL(msleep);
@@ -1604,10 +1627,8 @@ unsigned long msleep_interruptible(unsigned int msecs)
{
unsigned long timeout = msecs_to_jiffies(msecs) + 1;
- while (timeout && !signal_pending(current)) {
- set_current_state(TASK_INTERRUPTIBLE);
- timeout = schedule_timeout(timeout);
- }
+ while (timeout && !signal_pending(current))
+ timeout = schedule_timeout_interruptible(timeout);
return jiffies_to_msecs(timeout);
}
diff --git a/kernel/user.c b/kernel/user.c
index 734575d55769..89e562feb1b1 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -120,6 +120,10 @@ struct user_struct * alloc_uid(uid_t uid)
atomic_set(&new->processes, 0);
atomic_set(&new->files, 0);
atomic_set(&new->sigpending, 0);
+#ifdef CONFIG_INOTIFY
+ atomic_set(&new->inotify_watches, 0);
+ atomic_set(&new->inotify_devs, 0);
+#endif
new->mq_bytes = 0;
new->locked_shm = 0;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 259cf55da3c9..91bacb13a7e2 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -308,12 +308,9 @@ struct workqueue_struct *__create_workqueue(const char *name,
struct workqueue_struct *wq;
struct task_struct *p;
- BUG_ON(strlen(name) > 10);
-
- wq = kmalloc(sizeof(*wq), GFP_KERNEL);
+ wq = kzalloc(sizeof(*wq), GFP_KERNEL);
if (!wq)
return NULL;
- memset(wq, 0, sizeof(*wq));
wq->name = name;
/* We don't need the distraction of CPUs appearing and vanishing. */
@@ -501,7 +498,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
case CPU_UP_PREPARE:
/* Create a new workqueue thread for it. */
list_for_each_entry(wq, &workqueues, list) {
- if (create_workqueue_thread(wq, hotcpu) < 0) {
+ if (!create_workqueue_thread(wq, hotcpu)) {
printk("workqueue for %i failed\n", hotcpu);
return NOTIFY_BAD;
}