From 627371d73cdd04ed23fe098755b4f855138ad9e0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 29 Jul 2006 05:16:20 +0200 Subject: [PATCH] pi-futex: robust-futex exit crash fix Fix pi_state->list handling bugs: list handling mishap, locking error. Plus add more debug checks and fix a few style issues i noticed while debugging this. (reported by Ulrich Drepper and Jakub Jelinek.) Signed-off-by: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/futex.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index cf0c8e21d1ab..f59003b1d8f9 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -415,15 +415,15 @@ out_unlock: */ void exit_pi_state_list(struct task_struct *curr) { - struct futex_hash_bucket *hb; struct list_head *next, *head = &curr->pi_state_list; struct futex_pi_state *pi_state; + struct futex_hash_bucket *hb; union futex_key key; /* * We are a ZOMBIE and nobody can enqueue itself on * pi_state_list anymore, but we have to be careful - * versus waiters unqueueing themselfs + * versus waiters unqueueing themselves: */ spin_lock_irq(&curr->pi_lock); while (!list_empty(head)) { @@ -431,21 +431,24 @@ void exit_pi_state_list(struct task_struct *curr) next = head->next; pi_state = list_entry(next, struct futex_pi_state, list); key = pi_state->key; + hb = hash_futex(&key); spin_unlock_irq(&curr->pi_lock); - hb = hash_futex(&key); spin_lock(&hb->lock); spin_lock_irq(&curr->pi_lock); + /* + * We dropped the pi-lock, so re-check whether this + * task still owns the PI-state: + */ if (head->next != next) { spin_unlock(&hb->lock); continue; } - list_del_init(&pi_state->list); - WARN_ON(pi_state->owner != curr); - + WARN_ON(list_empty(&pi_state->list)); + list_del_init(&pi_state->list); pi_state->owner = NULL; spin_unlock_irq(&curr->pi_lock); @@ -470,7 +473,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) head = &hb->chain; list_for_each_entry_safe(this, next, head, list) { - if (match_futex (&this->key, &me->key)) { + if (match_futex(&this->key, &me->key)) { /* * Another waiter already exists - bump up * the refcount and return its pi_state: @@ -482,6 +485,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) if (unlikely(!pi_state)) return -EINVAL; + WARN_ON(!atomic_read(&pi_state->refcount)); + atomic_inc(&pi_state->refcount); me->pi_state = pi_state; @@ -510,6 +515,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) pi_state->key = me->key; spin_lock_irq(&p->pi_lock); + WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &p->pi_state_list); pi_state->owner = p; spin_unlock_irq(&p->pi_lock); @@ -584,9 +590,17 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) if (curval != uval) return -EINVAL; - list_del_init(&pi_state->owner->pi_state_list); + spin_lock_irq(&pi_state->owner->pi_lock); + WARN_ON(list_empty(&pi_state->list)); + list_del_init(&pi_state->list); + spin_unlock_irq(&pi_state->owner->pi_lock); + + spin_lock_irq(&new_owner->pi_lock); + WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &new_owner->pi_state_list); pi_state->owner = new_owner; + spin_unlock_irq(&new_owner->pi_lock); + rt_mutex_unlock(&pi_state->pi_mutex); return 0; @@ -1236,6 +1250,7 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock, /* Owner died? 
*/ if (q.pi_state->owner != NULL) { spin_lock_irq(&q.pi_state->owner->pi_lock); + WARN_ON(list_empty(&q.pi_state->list)); list_del_init(&q.pi_state->list); spin_unlock_irq(&q.pi_state->owner->pi_lock); } else @@ -1244,6 +1259,7 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock, q.pi_state->owner = current; spin_lock_irq(&current->pi_lock); + WARN_ON(!list_empty(&q.pi_state->list)); list_add(&q.pi_state->list, &current->pi_state_list); spin_unlock_irq(&current->pi_lock); -- cgit v1.2.3 From e3f2ddeac718c768fdac4b7fe69d465172f788a8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 29 Jul 2006 05:17:57 +0200 Subject: [PATCH] pi-futex: robust-futex exit Fix robust PI-futexes to be properly unlocked on unexpected exit. For this to work the kernel has to know whether a futex is a PI or a non-PI one, because the semantics are different. Since the space in relevant glibc data structures is extremely scarce, the best solution is to encode the 'PI' information in bit 0 of the robust list pointer. Existing (non-PI) glibc robust futexes have this bit always zero, so the ABI is kept. New glibc with PI-robust-futexes will set this bit. Further fixes from Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Ulrich Drepper Signed-off-by: Thomas Gleixner Signed-off-by: Linus Torvalds --- kernel/futex.c | 91 +++++++++++++++++++++++++++++++++++---------------------- kernel/futex_compat.c | 34 ++++++++++++++----- 2 files changed, 87 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index f59003b1d8f9..dda2049692a2 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -495,10 +495,13 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) } /* - * We are the first waiter - try to look up the real owner and - * attach the new pi_state to it: + * We are the first waiter - try to look up the real owner and attach + * the new pi_state to it, but bail out when the owner died bit is set + * and TID = 0: */ pid = uval & FUTEX_TID_MASK; + if (!pid && (uval & FUTEX_OWNER_DIED)) + return -ESRCH; p = futex_find_get_task(pid); if (!p) return -ESRCH; @@ -579,16 +582,17 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) * kept enabled while there is PI state around. We must also * preserve the owner died bit.) */ - newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid; - - inc_preempt_count(); - curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); - dec_preempt_count(); + if (!(uval & FUTEX_OWNER_DIED)) { + newval = FUTEX_WAITERS | new_owner->pid; - if (curval == -EFAULT) - return -EFAULT; - if (curval != uval) - return -EINVAL; + inc_preempt_count(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); + dec_preempt_count(); + if (curval == -EFAULT) + return -EFAULT; + if (curval != uval) + return -EINVAL; + } spin_lock_irq(&pi_state->owner->pi_lock); WARN_ON(list_empty(&pi_state->list)); @@ -1443,9 +1447,11 @@ retry_locked: * again. 
If it succeeds then we can return without waking * anyone else up: */ - inc_preempt_count(); - uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); - dec_preempt_count(); + if (!(uval & FUTEX_OWNER_DIED)) { + inc_preempt_count(); + uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); + dec_preempt_count(); + } if (unlikely(uval == -EFAULT)) goto pi_faulted; @@ -1478,9 +1484,11 @@ retry_locked: /* * No waiters - kernel unlocks the futex: */ - ret = unlock_futex_pi(uaddr, uval); - if (ret == -EFAULT) - goto pi_faulted; + if (!(uval & FUTEX_OWNER_DIED)) { + ret = unlock_futex_pi(uaddr, uval); + if (ret == -EFAULT) + goto pi_faulted; + } out_unlock: spin_unlock(&hb->lock); @@ -1699,9 +1707,9 @@ err_unlock: * Process a futex-list entry, check whether it's owned by the * dying task, and do notification if so: */ -int handle_futex_death(u32 __user *uaddr, struct task_struct *curr) +int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) { - u32 uval, nval; + u32 uval, nval, mval; retry: if (get_user(uval, uaddr)) @@ -1718,20 +1726,44 @@ retry: * thread-death.) The rest of the cleanup is done in * userspace. */ - nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, - uval | FUTEX_OWNER_DIED); + mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; + nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); + if (nval == -EFAULT) return -1; if (nval != uval) goto retry; - if (uval & FUTEX_WAITERS) - futex_wake(uaddr, 1); + /* + * Wake robust non-PI futexes here. The wakeup of + * PI futexes happens in exit_pi_state(): + */ + if (!pi) { + if (uval & FUTEX_WAITERS) + futex_wake(uaddr, 1); + } } return 0; } +/* + * Fetch a robust-list pointer. Bit 0 signals PI futexes: + */ +static inline int fetch_robust_entry(struct robust_list __user **entry, + struct robust_list __user **head, int *pi) +{ + unsigned long uentry; + + if (get_user(uentry, (unsigned long *)head)) + return -EFAULT; + + *entry = (void *)(uentry & ~1UL); + *pi = uentry & 1; + + return 0; +} + /* * Walk curr->robust_list (very carefully, it's a userspace list!) * and mark any locks found there dead, and notify any waiters. 
@@ -1742,14 +1774,14 @@ void exit_robust_list(struct task_struct *curr) { struct robust_list_head __user *head = curr->robust_list; struct robust_list __user *entry, *pending; - unsigned int limit = ROBUST_LIST_LIMIT; + unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; unsigned long futex_offset; /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): */ - if (get_user(entry, &head->list.next)) + if (fetch_robust_entry(&entry, &head->list.next, &pi)) return; /* * Fetch the relative futex offset: @@ -1760,10 +1792,11 @@ void exit_robust_list(struct task_struct *curr) * Fetch any possibly pending lock-add first, and handle it * if it exists: */ - if (get_user(pending, &head->list_op_pending)) + if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) return; + if (pending) - handle_futex_death((void *)pending + futex_offset, curr); + handle_futex_death((void *)pending + futex_offset, curr, pip); while (entry != &head->list) { /* @@ -1772,12 +1805,12 @@ void exit_robust_list(struct task_struct *curr) */ if (entry != pending) if (handle_futex_death((void *)entry + futex_offset, - curr)) + curr, pi)) return; /* * Fetch the next entry in the list: */ - if (get_user(entry, &entry->next)) + if (fetch_robust_entry(&entry, &entry->next, &pi)) return; /* * Avoid excessively long or circular lists: diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index d1d92b441fb7..d1aab1a452cc 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -12,6 +12,23 @@ #include + +/* + * Fetch a robust-list pointer. Bit 0 signals PI futexes: + */ +static inline int +fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, + compat_uptr_t *head, int *pi) +{ + if (get_user(*uentry, head)) + return -EFAULT; + + *entry = compat_ptr((*uentry) & ~1); + *pi = (unsigned int)(*uentry) & 1; + + return 0; +} + /* * Walk curr->robust_list (very carefully, it's a userspace list!) * and mark any locks found there dead, and notify any waiters. 
@@ -22,17 +39,16 @@ void compat_exit_robust_list(struct task_struct *curr) { struct compat_robust_list_head __user *head = curr->compat_robust_list; struct robust_list __user *entry, *pending; + unsigned int limit = ROBUST_LIST_LIMIT, pi; compat_uptr_t uentry, upending; - unsigned int limit = ROBUST_LIST_LIMIT; compat_long_t futex_offset; /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): */ - if (get_user(uentry, &head->list.next)) + if (fetch_robust_entry(&uentry, &entry, &head->list.next, &pi)) return; - entry = compat_ptr(uentry); /* * Fetch the relative futex offset: */ @@ -42,11 +58,11 @@ void compat_exit_robust_list(struct task_struct *curr) * Fetch any possibly pending lock-add first, and handle it * if it exists: */ - if (get_user(upending, &head->list_op_pending)) + if (fetch_robust_entry(&upending, &pending, + &head->list_op_pending, &pi)) return; - pending = compat_ptr(upending); if (upending) - handle_futex_death((void *)pending + futex_offset, curr); + handle_futex_death((void *)pending + futex_offset, curr, pi); while (compat_ptr(uentry) != &head->list) { /* @@ -55,15 +71,15 @@ void compat_exit_robust_list(struct task_struct *curr) */ if (entry != pending) if (handle_futex_death((void *)entry + futex_offset, - curr)) + curr, pi)) return; /* * Fetch the next entry in the list: */ - if (get_user(uentry, (compat_uptr_t *)&entry->next)) + if (fetch_robust_entry(&uentry, &entry, + (compat_uptr_t *)&entry->next, &pi)) return; - entry = compat_ptr(uentry); /* * Avoid excessively long or circular lists: */ -- cgit v1.2.3 From f712c0c7e1796f92e45e4de144e247816d974b8f Mon Sep 17 00:00:00 2001 From: "Siddha, Suresh B" Date: Sun, 30 Jul 2006 03:02:59 -0700 Subject: [PATCH] sched: build_sched_domains() fix Use the correct groups while initializing sched groups power for allnodes_domain. This fixes the crash observed while creating exclusive cpusets. Signed-off-by: Suresh Siddha Reported-and-tested-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b44b9a43b0fc..41a571806ce0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6494,7 +6494,12 @@ static int build_sched_domains(const cpumask_t *cpu_map) for (i = 0; i < MAX_NUMNODES; i++) init_numa_sched_groups_power(sched_group_nodes[i]); - init_numa_sched_groups_power(sched_group_allnodes); + if (sched_group_allnodes) { + int group = cpu_to_allnodes_group(first_cpu(*cpu_map)); + struct sched_group *sg = &sched_group_allnodes[group]; + + init_numa_sched_groups_power(sg); + } #endif /* Attach the domains */ -- cgit v1.2.3 From 15a647eba94c3da27ccc666bea72e7cca06b2d19 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Sun, 30 Jul 2006 03:03:08 -0700 Subject: [PATCH] genirq: {en,dis}able_irq_wake() need refcounting too IRQs need refcounting and a state flag to track whether the IRQ should be enabled or disabled as a "normal IRQ" source after a series of calls to {en,dis}able_irq(). For shared IRQs, the IRQ must be enabled so long as at least one driver needs it active. Likewise, IRQs need the same support to track whether the IRQ should be enabled or disabled as a "wakeup event" source after a series of calls to {en,dis}able_irq_wake(). For shared IRQs, the IRQ must be enabled as a wakeup source during sleep so long as at least one driver needs it. But right now they _don't have_ that refcounting ... 
which means sharing a wakeup-capable IRQ can't work correctly in some configurations. This patch adds the refcount and flag mechanisms to set_irq_wake() -- which is what {en,dis}able_irq_wake() call -- and minimal documentation of what the irq wake mechanism does. Drivers relying on the older (broken) "toggle" semantics will trigger a warning; that'll be a handful of drivers on ARM systems. Signed-off-by: David Brownell Acked-by: Ingo Molnar Acked-by: Thomas Gleixner Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 4e461438e48b..92be519eff26 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -137,16 +137,40 @@ EXPORT_SYMBOL(enable_irq); * @irq: interrupt to control * @on: enable/disable power management wakeup * - * Enable/disable power management wakeup mode + * Enable/disable power management wakeup mode, which is + * disabled by default. Enables and disables must match, + * just as they match for non-wakeup mode support. + * + * Wakeup mode lets this IRQ wake the system from sleep + * states like "suspend to RAM". */ int set_irq_wake(unsigned int irq, unsigned int on) { struct irq_desc *desc = irq_desc + irq; unsigned long flags; int ret = -ENXIO; + int (*set_wake)(unsigned, unsigned) = desc->chip->set_wake; + /* wakeup-capable irqs can be shared between drivers that + * don't need to have the same sleep mode behaviors. + */ spin_lock_irqsave(&desc->lock, flags); - if (desc->chip->set_wake) + if (on) { + if (desc->wake_depth++ == 0) + desc->status |= IRQ_WAKEUP; + else + set_wake = NULL; + } else { + if (desc->wake_depth == 0) { + printk(KERN_WARNING "Unbalanced IRQ %d " + "wake disable\n", irq); + WARN_ON(1); + } else if (--desc->wake_depth == 0) + desc->status &= ~IRQ_WAKEUP; + else + set_wake = NULL; + } + if (set_wake) ret = desc->chip->set_wake(irq, on); spin_unlock_irqrestore(&desc->lock, flags); return ret; -- cgit v1.2.3 From 7d94dddd438bcba97db44f120da39bb001b5249f Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Sun, 30 Jul 2006 03:03:10 -0700 Subject: [PATCH] make taskstats sending completely independent of delay accounting on/off status Complete the separation of delay accounting and taskstats by ignoring the return value of delay accounting functions that fill in parts of taskstats before it is sent out (either in response to a command or as part of a task exit). Also make delayacct_add_tsk return silently when delay accounting is turned off rather than treat it as an error. 
Signed-off-by: Shailabh Nagar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index f45179ce028e..b4c737a11408 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -177,7 +177,7 @@ static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) static int fill_pid(pid_t pid, struct task_struct *pidtsk, struct taskstats *stats) { - int rc; + int rc = 0; struct task_struct *tsk = pidtsk; if (!pidtsk) { @@ -196,12 +196,10 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk, * Each accounting subsystem adds calls to its functions to * fill in relevant parts of struct taskstsats as follows * - * rc = per-task-foo(stats, tsk); - * if (rc) - * goto err; + * per-task-foo(stats, tsk); */ - rc = delayacct_add_tsk(stats, tsk); + delayacct_add_tsk(stats, tsk); stats->version = TASKSTATS_VERSION; /* Define err: label here if needed */ -- cgit v1.2.3 From d94a041519f3ab1ac023bf917619cd8c4a7d3c01 Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Sun, 30 Jul 2006 03:03:11 -0700 Subject: [PATCH] taskstats: free skb, avoid returns in send_cpu_listeners Add a missing freeing of skb in the case there are no listeners at all. Also remove the returning of error values by the function as it is unused by the sole caller. Signed-off-by: Shailabh Nagar Signed-off-by: Chandra Seetharaman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/taskstats.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index b4c737a11408..e78187657330 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -121,46 +121,45 @@ static int send_reply(struct sk_buff *skb, pid_t pid) /* * Send taskstats data in @skb to listeners registered for @cpu's exit data */ -static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) +static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) { struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); struct listener_list *listeners; struct listener *s, *tmp; struct sk_buff *skb_next, *skb_cur = skb; void *reply = genlmsg_data(genlhdr); - int rc, ret, delcount = 0; + int rc, delcount = 0; rc = genlmsg_end(skb, reply); if (rc < 0) { nlmsg_free(skb); - return rc; + return; } rc = 0; listeners = &per_cpu(listener_array, cpu); down_read(&listeners->sem); - list_for_each_entry_safe(s, tmp, &listeners->list, list) { + list_for_each_entry(s, &listeners->list, list) { skb_next = NULL; if (!list_is_last(&s->list, &listeners->list)) { skb_next = skb_clone(skb_cur, GFP_KERNEL); - if (!skb_next) { - nlmsg_free(skb_cur); - rc = -ENOMEM; + if (!skb_next) break; - } } - ret = genlmsg_unicast(skb_cur, s->pid); - if (ret == -ECONNREFUSED) { + rc = genlmsg_unicast(skb_cur, s->pid); + if (rc == -ECONNREFUSED) { s->valid = 0; delcount++; - rc = ret; } skb_cur = skb_next; } up_read(&listeners->sem); + if (skb_cur) + nlmsg_free(skb_cur); + if (!delcount) - return rc; + return; /* Delete invalidated entries */ down_write(&listeners->sem); @@ -171,7 +170,6 @@ static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) } } up_write(&listeners->sem); - return rc; } static int fill_pid(pid_t pid, struct task_struct *pidtsk, -- cgit v1.2.3 From 163ecdff060f2fa9e8f5238882fd0137493556a6 Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Sun, 30 Jul 2006 03:03:11 -0700 
Subject: [PATCH] delay accounting: temporarily enable by default Enable delay accounting by default so that the feature gets coverage testing without requiring special measures. Earlier, it was off by default and had to be enabled via a boot time param. This patch reverses the default behaviour to improve coverage testing. It can be removed late in the kernel development cycle if it's believed users shouldn't have to incur any cost if they don't want delay accounting. Or it can be retained forever if the utility of the stats is deemed common enough to warrant keeping the feature on. Signed-off-by: Shailabh Nagar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/delayacct.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/delayacct.c b/kernel/delayacct.c index f05392d64267..57ca3730205d 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -19,15 +19,15 @@ #include #include -int delayacct_on __read_mostly; /* Delay accounting turned on/off */ +int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ kmem_cache_t *delayacct_cache; -static int __init delayacct_setup_enable(char *str) +static int __init delayacct_setup_disable(char *str) { - delayacct_on = 1; + delayacct_on = 0; return 1; } -__setup("delayacct", delayacct_setup_enable); +__setup("nodelayacct", delayacct_setup_disable); void delayacct_init(void) { -- cgit v1.2.3 From a9ad965ea9a6d719daf333847a2ceb0e363994bd Mon Sep 17 00:00:00 2001 From: "bibo, mao" Date: Sun, 30 Jul 2006 03:03:26 -0700 Subject: [PATCH] IA64: kprobe invalidate icache of jump buffer Kprobes inserts a breakpoint instruction at the probe point and then jumps to the instruction slot when the breakpoint is hit, so the instruction slot icache must be consistent with the dcache. This patch invalidates the instruction slot icache area. Without it, some machines fault when executing an instruction slot whose icache content is inconsistent with the dcache. Signed-off-by: bibo,mao Acked-by: "Luck, Tony" Acked-by: Keshavamurthy Anil S Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 64aab081153b..3f57dfdc8f92 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -393,6 +393,7 @@ static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) { copy_kprobe(p, ap); + flush_insn_slot(ap); ap->addr = p->addr; ap->pre_handler = aggr_pre_handler; ap->fault_handler = aggr_fault_handler; -- cgit v1.2.3 From 8c78f3075dab4be279e283f901f00e33ce44890a Mon Sep 17 00:00:00 2001 From: Chandra Seetharaman Date: Sun, 30 Jul 2006 03:03:35 -0700 Subject: [PATCH] cpu hotplug: replace __devinit* with __cpuinit* for cpu notifications A few of the callback functions and notifier blocks that are associated with cpu notifications incorrectly have __devinit and __devinitdata. They should be __cpuinit and __cpuinitdata instead. It makes no functional difference but wastes text area when CONFIG_HOTPLUG is enabled and CONFIG_HOTPLUG_CPU is not. This patch fixes all those instances. 
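To illustrate the convention, here is a minimal sketch of a correctly annotated CPU-notification callback and its notifier block (names are hypothetical; assumes the 2.6.18-era <linux/cpu.h> and <linux/notifier.h> interfaces). With __cpuinit/__cpuinitdata the code is discarded when CONFIG_HOTPLUG_CPU is off, whereas __devinit/__devinitdata would keep it around whenever CONFIG_HOTPLUG is on:

static int __cpuinit example_cpu_notify(struct notifier_block *self,
					unsigned long action, void *hcpu)
{
	long cpu = (long)hcpu;

	switch (action) {
	case CPU_UP_PREPARE:
		printk(KERN_DEBUG "preparing CPU %ld\n", cpu);
		break;
	case CPU_DEAD:
		printk(KERN_DEBUG "CPU %ld is gone\n", cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block __cpuinitdata example_cpu_nb = {
	.notifier_call = example_cpu_notify,
};

Such a block would be registered with register_cpu_notifier(&example_cpu_nb) from an initialization path that is itself __init or __cpuinit.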
Signed-off-by: Chandra Seetharaman Cc: Ashok Raj Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hrtimer.c | 4 ++-- kernel/rcupdate.c | 4 ++-- kernel/softirq.c | 4 ++-- kernel/softlockup.c | 4 ++-- kernel/timer.c | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index d17766d40dab..be989efc7856 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -835,7 +835,7 @@ static void migrate_hrtimers(int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -static int __devinit hrtimer_cpu_notify(struct notifier_block *self, +static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -859,7 +859,7 @@ static int __devinit hrtimer_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __devinitdata hrtimers_nb = { +static struct notifier_block __cpuinitdata hrtimers_nb = { .notifier_call = hrtimer_cpu_notify, }; diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 759805c9859a..436ab35f6fa7 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -548,7 +548,7 @@ static void __devinit rcu_online_cpu(int cpu) tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); } -static int __devinit rcu_cpu_notify(struct notifier_block *self, +static int __cpuinit rcu_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -565,7 +565,7 @@ static int __devinit rcu_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __devinitdata rcu_nb = { +static struct notifier_block __cpuinitdata rcu_nb = { .notifier_call = rcu_cpu_notify, }; diff --git a/kernel/softirq.c b/kernel/softirq.c index 0f08a84ae307..aab880677ce0 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -547,7 +547,7 @@ static void takeover_tasklets(unsigned int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -static int __devinit cpu_callback(struct notifier_block *nfb, +static int __cpuinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -587,7 +587,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block __devinitdata cpu_nfb = { +static struct notifier_block __cpuinitdata cpu_nfb = { .notifier_call = cpu_callback }; diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 6b76caa22981..03e6a2b0b787 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -104,7 +104,7 @@ static int watchdog(void * __bind_cpu) /* * Create/destroy watchdog threads as CPUs come and go: */ -static int __devinit +static int __cpuinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { int hotcpu = (unsigned long)hcpu; @@ -142,7 +142,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_OK; } -static struct notifier_block __devinitdata cpu_nfb = { +static struct notifier_block __cpuinitdata cpu_nfb = { .notifier_call = cpu_callback }; diff --git a/kernel/timer.c b/kernel/timer.c index 05809c2e2fd6..ee319fc69f40 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1688,7 +1688,7 @@ static void __devinit migrate_timers(int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -static int __devinit timer_cpu_notify(struct notifier_block *self, +static int __cpuinit timer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -1708,7 +1708,7 @@ static int __devinit timer_cpu_notify(struct 
notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __devinitdata timers_nb = { .notifier_call = timer_cpu_notify, }; -- cgit v1.2.3 From 6ea24f9ad18c65cc179593b5cc2a88cdadf8cc0c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sun, 30 Jul 2006 03:03:38 -0700 Subject: [PATCH] fix bad macro param in timer.c We have #define INDEX(N) (base->timer_jiffies >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK and it's used via list = varray[i + 1]->vec + (INDEX(i + 1)); So, due to underparenthesisation, this INDEX(i+1) is now a ... (TVR_BITS + i + 1 * TVN_BITS)) ... So this bugfix changes behaviour. It worked before by sheer luck: "If i was anything but 0, it was broken. But this was only used by s390 and arm. Since it was for the next interrupt, could that next interrupt be a problem (going into the second cascade)? But it was probably seldom wrong. That is, this would fail if the next interrupt was in the second cascade, and was wrapped. Which may never have happened. Also if it did happen, it would have just missed the interrupt. If an interrupt was missed, and no one was there to miss it, was it really missed :-)" Signed-off-by: Steven Rostedt Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index ee319fc69f40..726caf9a0a69 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -408,7 +408,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index) * This function cascades all vectors and executes all expired timer * vectors. */ -#define INDEX(N) (base->timer_jiffies >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK +#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) static inline void __run_timers(tvec_base_t *base) { -- cgit v1.2.3 From 2d7d253548cffdce80f4e03664686e9ccb1b0ed7 Mon Sep 17 00:00:00 2001 From: Jim Houston Date: Sun, 30 Jul 2006 03:03:39 -0700 Subject: [PATCH] fix cond_resched() fix cond_resched_lock() calls __resched_legal() before dropping the spin lock. __resched_legal() will always find the preempt_count non-zero and will prevent the call to __cond_resched(). The attached patch adds a parameter to __resched_legal() with the expected preempt_count value. 
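To make the preempt_count arithmetic concrete, consider a hypothetical caller (a sketch only, not code from the patch): inside the loop preempt_count() is 1 because exactly one spinlock is held, so the legality check made on behalf of cond_resched_lock() must expect 1, while plain cond_resched() still expects 0.

/* Hypothetical long scan that yields while holding 'lock'. */
static void example_scan(spinlock_t *lock, struct list_head *head)
{
	struct list_head *p;

	spin_lock(lock);
	list_for_each(p, head) {
		/* ... process one entry ... */

		/* may drop and retake 'lock' if a reschedule is due;
		 * runs with preempt_count() == 1, hence __resched_legal(1) */
		cond_resched_lock(lock);
	}
	spin_unlock(lock);
}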
Cc: Ingo Molnar Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 41a571806ce0..de440b220b4c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4456,9 +4456,9 @@ asmlinkage long sys_sched_yield(void) return 0; } -static inline int __resched_legal(void) +static inline int __resched_legal(int expected_preempt_count) { - if (unlikely(preempt_count())) + if (unlikely(preempt_count() != expected_preempt_count)) return 0; if (unlikely(system_state != SYSTEM_RUNNING)) return 0; @@ -4484,7 +4484,7 @@ static void __cond_resched(void) int __sched cond_resched(void) { - if (need_resched() && __resched_legal()) { + if (need_resched() && __resched_legal(0)) { __cond_resched(); return 1; } @@ -4510,7 +4510,7 @@ int cond_resched_lock(spinlock_t *lock) ret = 1; spin_lock(lock); } - if (need_resched() && __resched_legal()) { + if (need_resched() && __resched_legal(1)) { spin_release(&lock->dep_map, 1, _THIS_IP_); _raw_spin_unlock(lock); preempt_enable_no_resched(); @@ -4526,7 +4526,7 @@ int __sched cond_resched_softirq(void) { BUG_ON(!in_softirq()); - if (need_resched() && __resched_legal()) { + if (need_resched() && __resched_legal(0)) { raw_local_irq_disable(); _local_bh_enable(); raw_local_irq_enable(); -- cgit v1.2.3 From 0fcb78c22f06340cdba884d7381adb3a0148bbb6 Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Sun, 30 Jul 2006 03:03:42 -0700 Subject: [PATCH] Add DocBook documentation for workqueue functions kernel/workqueue.c was omitted from generating kernel documentation. This adds a new section "Workqueues and Kevents" and adds documentation for some of the functions. Some functions in this file already had DocBook-style comments, now they finally become visible. Signed-off-by: Rolf Eike Beer Cc: "Randy.Dunlap" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 54 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index eebb1d839235..448e8f7b342d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -93,9 +93,12 @@ static void __queue_work(struct cpu_workqueue_struct *cwq, spin_unlock_irqrestore(&cwq->lock, flags); } -/* - * Queue work on a workqueue. Return non-zero if it was successfully - * added. +/** + * queue_work - queue work on a workqueue + * @wq: workqueue to use + * @work: work to queue + * + * Returns non-zero if it was successfully added. * * We queue the work to the CPU it was submitted, but there is no * guarantee that it will be processed by that CPU. @@ -128,6 +131,14 @@ static void delayed_work_timer_fn(unsigned long __data) __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); } +/** + * queue_delayed_work - queue work on a workqueue after delay + * @wq: workqueue to use + * @work: work to queue + * @delay: number of jiffies to wait before queueing + * + * Returns non-zero if it was successfully added. 
+ */ int fastcall queue_delayed_work(struct workqueue_struct *wq, struct work_struct *work, unsigned long delay) { @@ -150,6 +161,15 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(queue_delayed_work); +/** + * queue_delayed_work_on - queue work on specific CPU after delay + * @cpu: CPU number to execute work on + * @wq: workqueue to use + * @work: work to queue + * @delay: number of jiffies to wait before queueing + * + * Returns non-zero if it was successfully added. + */ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work, unsigned long delay) { @@ -275,8 +295,9 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) } } -/* +/** * flush_workqueue - ensure that any scheduled work has run to completion. + * @wq: workqueue to flush * * Forces execution of the workqueue and blocks until its completion. * This is typically used in driver shutdown handlers. @@ -400,6 +421,12 @@ static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu) kthread_stop(p); } +/** + * destroy_workqueue - safely terminate a workqueue + * @wq: target workqueue + * + * Safely destroy a workqueue. All work currently pending will be done first. + */ void destroy_workqueue(struct workqueue_struct *wq) { int cpu; @@ -425,18 +452,41 @@ EXPORT_SYMBOL_GPL(destroy_workqueue); static struct workqueue_struct *keventd_wq; +/** + * schedule_work - put work task in global workqueue + * @work: job to be done + * + * This puts a job in the kernel-global workqueue. + */ int fastcall schedule_work(struct work_struct *work) { return queue_work(keventd_wq, work); } EXPORT_SYMBOL(schedule_work); +/** + * schedule_delayed_work - put work task in global workqueue after delay + * @work: job to be done + * @delay: number of jiffies to wait + * + * After waiting for a given time this puts a job in the kernel-global + * workqueue. + */ int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay) { return queue_delayed_work(keventd_wq, work, delay); } EXPORT_SYMBOL(schedule_delayed_work); +/** + * schedule_delayed_work_on - queue work in global workqueue on CPU after delay + * @cpu: cpu to use + * @work: job to be done + * @delay: number of jiffies to wait + * + * After waiting for a given time this puts a job in the kernel-global + * workqueue on the specified CPU. + */ int schedule_delayed_work_on(int cpu, struct work_struct *work, unsigned long delay) { -- cgit v1.2.3 From b50f60ceeef2e38e529737c0260d9543939915ad Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sun, 30 Jul 2006 03:03:52 -0700 Subject: [PATCH] pi-futex: missing pi_waiters plist initialization Initialize init task's pi_waiters plist. Otherwise cpu hotplug of cpu 0 might crash, since rt_mutex_getprio() accesses an uninitialized list head. call chain which led to crash: take_cpu_down sched_idle_next __setscheduler rt_mutex_getprio Using PLIST_HEAD_INIT in the INIT_TASK macro doesn't work unfortunately, since the pi_waiters member is only conditionally present. 
Cc: Arjan van de Ven Cc: Thomas Gleixner Acked-by: Ingo Molnar Signed-off-by: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index de440b220b4c..a2be2d055299 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6766,6 +6766,11 @@ void __init sched_init(void) } set_load_weight(&init_task); + +#ifdef CONFIG_RT_MUTEXES + plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); +#endif + /* * The boot idle thread does lazy MMU switching as well: */ -- cgit v1.2.3 From 3c829c367a1a52550378584a657768217971e587 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Sun, 30 Jul 2006 03:04:02 -0700 Subject: [PATCH] Reducing local_bh_enable/disable overhead in irqtrace The recent changes from the irqtrace feature have added overhead to local_bh_disable and local_bh_enable that reduces UDP performance across x86_64 and IA64, even though IA64 does not support the irqtrace feature. The patch in question is "[PATCH] lockdep: irqtrace subsystem, core": http://www.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=de30a2b355ea85350ca2f58f3b9bf4e5bc007986 Prior to this patch, local_bh_disable was a short macro. Now it is a function which calls __local_bh_disable with added irq flags save and restore. The irq flags save and restore were also added to local_bh_enable, probably for injecting the trace irqs code. This overhead is on the generic code path across all architectures. On an IA-64 test machine (Itanium-2 1.6 GHz) running a benchmark like netperf's UDP streaming test, the added overhead results in a drop of 3% in throughput, as udp_sendmsg calls local_bh_enable/disable several times. Other workloads that have heavy usage of local_bh_enable/disable could also be affected. The patch ideally should not have affected IA-64 performance as it does not have IRQ tracing support. A significant portion of the overhead is in the added irq flags save and restore, which I think is not needed if IRQ tracing is unused. A suggested patch is attached below that recovers the lost performance. However, the "ifdef"s in the patch are a bit ugly. 
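To see why the cost shows up so clearly, here is a hedged sketch of the usage pattern described above (hypothetical function; udp_sendmsg is the real-world analogue). Before this patch, every such pair paid an irq-flags save/restore even on architectures with no irq-tracing support:

/* Hypothetical per-packet path: two local_bh transitions per call. */
static void example_xmit_one(struct sk_buff *skb)
{
	local_bh_disable();	/* enter softirq-safe section */
	/* ... enqueue skb, update per-cpu counters ... */
	local_bh_enable();	/* may run pending softirqs */
}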
Signed-off-by: Tim Chen Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/softirq.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index aab880677ce0..3789ca98197c 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -65,6 +65,7 @@ static inline void wakeup_softirqd(void) * This one is for softirq.c-internal use, * where hardirqs are disabled legitimately: */ +#ifdef CONFIG_TRACE_IRQFLAGS static void __local_bh_disable(unsigned long ip) { unsigned long flags; @@ -80,6 +81,13 @@ static void __local_bh_disable(unsigned long ip) trace_softirqs_off(ip); raw_local_irq_restore(flags); } +#else /* !CONFIG_TRACE_IRQFLAGS */ +static inline void __local_bh_disable(unsigned long ip) +{ + add_preempt_count(SOFTIRQ_OFFSET); + barrier(); +} +#endif /* CONFIG_TRACE_IRQFLAGS */ void local_bh_disable(void) { @@ -121,12 +129,16 @@ EXPORT_SYMBOL(_local_bh_enable); void local_bh_enable(void) { +#ifdef CONFIG_TRACE_IRQFLAGS unsigned long flags; WARN_ON_ONCE(in_irq()); +#endif WARN_ON_ONCE(irqs_disabled()); +#ifdef CONFIG_TRACE_IRQFLAGS local_irq_save(flags); +#endif /* * Are softirqs going to be turned on now: */ @@ -142,18 +154,22 @@ void local_bh_enable(void) do_softirq(); dec_preempt_count(); +#ifdef CONFIG_TRACE_IRQFLAGS local_irq_restore(flags); +#endif preempt_check_resched(); } EXPORT_SYMBOL(local_bh_enable); void local_bh_enable_ip(unsigned long ip) { +#ifdef CONFIG_TRACE_IRQFLAGS unsigned long flags; WARN_ON_ONCE(in_irq()); local_irq_save(flags); +#endif /* * Are softirqs going to be turned on now: */ @@ -169,7 +185,9 @@ void local_bh_enable_ip(unsigned long ip) do_softirq(); dec_preempt_count(); +#ifdef CONFIG_TRACE_IRQFLAGS local_irq_restore(flags); +#endif preempt_check_resched(); } EXPORT_SYMBOL(local_bh_enable_ip); -- cgit v1.2.3 From d07fe82c24daab2360e2790f488bcffa7db74825 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sun, 30 Jul 2006 03:04:03 -0700 Subject: [PATCH] reference rt-mutex-design in rtmutex.c In order to prevent Doc Rot, this patch adds a reference to the design document for rtmutex.c in rtmutex.c. So when someone needs to update or change the design of that file they will know that a document actually exists that explains the design (helping them change it), and hopefully that they will update the document if they too change the design. Signed-off-by: Steven Rostedt Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/rtmutex.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index d2ef13b485e7..3e13a1e5856f 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -7,6 +7,8 @@ * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt * Copyright (C) 2006 Esben Nielsen + * + * See Documentation/rt-mutex-design.txt for details. */ #include #include -- cgit v1.2.3 From 51d8c5edd3b166fcc51aba84d78761d578400a7c Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Sun, 30 Jul 2006 03:04:14 -0700 Subject: [PATCH] timer: Fix tvec_bases initializer kernel/timer.c defines a (per-cpu) pointer to tvec_base_t, but initializes it using { &a_tvec_base_t }, which sparse warns about; change this to just &a_tvec_base_t. 
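The warning is easy to reproduce in isolation; a minimal sketch with hypothetical names (assumes <linux/percpu.h>):

struct foo {
	int counter;
};

static struct foo boot_foo;

/* Correct: a per-CPU pointer is initialized with the pointer value. */
static DEFINE_PER_CPU(struct foo *, foo_bases) = &boot_foo;

/*
 * The brace-wrapped form the patch removes initializes the scalar as if
 * it were an aggregate, which sparse flags:
 *
 *	static DEFINE_PER_CPU(struct foo *, foo_bases) = { &boot_foo };
 */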
Signed-off-by: Josh Triplett Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 726caf9a0a69..b650f04888ed 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -84,7 +84,7 @@ typedef struct tvec_t_base_s tvec_base_t; tvec_base_t boot_tvec_bases; EXPORT_SYMBOL(boot_tvec_bases); -static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = { &boot_tvec_bases }; +static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; static inline void set_running_timer(tvec_base_t *base, struct timer_list *timer) -- cgit v1.2.3 From ae74c3b69a08e1de20cb681ec959f3a48af0006a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 2 Aug 2006 20:17:49 -0700 Subject: Fix force_sig_info() semantics after cleanups Suresh points out that commit b0423a0d9cc836b2c3d796623cd19236bfedfe63 broke the semantics of a synchronous signal like SIGSEGV occurring recursively inside its own handler (or, indeed, any other context when the signal was blocked). That was unintentional, and this fixes things up by reinstating the old semantics, but without reverting the cleanups. Cc: Paul E. McKenney Acked-by: Suresh Siddha Signed-off-by: Linus Torvalds --- kernel/signal.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 7fe874d12fae..bfdb5686fa3e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -791,22 +791,31 @@ out: /* * Force a signal that the process can't ignore: if necessary * we unblock the signal and change any SIG_IGN to SIG_DFL. + * + * Note: If we unblock the signal, we always reset it to SIG_DFL, + * since we do not want to have a signal handler that was blocked + * be invoked when user space had explicitly blocked it. + * + * We don't want to have recursive SIGSEGV's etc, for example. */ - int force_sig_info(int sig, struct siginfo *info, struct task_struct *t) { unsigned long int flags; - int ret; + int ret, blocked, ignored; + struct k_sigaction *action; spin_lock_irqsave(&t->sighand->siglock, flags); - if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) { - t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; - } - if (sigismember(&t->blocked, sig)) { - sigdelset(&t->blocked, sig); + action = &t->sighand->action[sig-1]; + ignored = action->sa.sa_handler == SIG_IGN; + blocked = sigismember(&t->blocked, sig); + if (blocked || ignored) { + action->sa.sa_handler = SIG_DFL; + if (blocked) { + sigdelset(&t->blocked, sig); + recalc_sigpending_tsk(t); + } } - recalc_sigpending_tsk(t); ret = specific_send_sig_info(sig, info, t); spin_unlock_irqrestore(&t->sighand->siglock, flags); -- cgit v1.2.3 From 3e2efce067cec0099f99ae59f28feda99b02b498 Mon Sep 17 00:00:00 2001 From: Amy Griffis Date: Thu, 13 Jul 2006 13:16:02 -0400 Subject: [PATCH] fix faulty inode data collection for open() with O_CREAT When the specified path is an existing file or when it is a symlink, audit collects the wrong inode number, which causes it to miss the open() event. Adding a second hook to the open() path fixes this. Also add audit_copy_inode() to consolidate some code. 
Signed-off-by: Amy Griffis Signed-off-by: Al Viro --- kernel/auditsc.c | 63 ++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index ae40ac8c39e7..b939ed2da3ee 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1199,14 +1199,18 @@ void audit_putname(const char *name) #endif } -static void audit_inode_context(int idx, const struct inode *inode) +/* Copy inode data into an audit_names. */ +static void audit_copy_inode(struct audit_names *name, const struct inode *inode) { - struct audit_context *context = current->audit_context; - - selinux_get_inode_sid(inode, &context->names[idx].osid); + name->ino = inode->i_ino; + name->dev = inode->i_sb->s_dev; + name->mode = inode->i_mode; + name->uid = inode->i_uid; + name->gid = inode->i_gid; + name->rdev = inode->i_rdev; + selinux_get_inode_sid(inode, &name->osid); } - /** * audit_inode - store the inode and device from a lookup * @name: name being audited @@ -1240,13 +1244,7 @@ void __audit_inode(const char *name, const struct inode *inode) ++context->ino_count; #endif } - context->names[idx].ino = inode->i_ino; - context->names[idx].dev = inode->i_sb->s_dev; - context->names[idx].mode = inode->i_mode; - context->names[idx].uid = inode->i_uid; - context->names[idx].gid = inode->i_gid; - context->names[idx].rdev = inode->i_rdev; - audit_inode_context(idx, inode); + audit_copy_inode(&context->names[idx], inode); } /** @@ -1302,16 +1300,37 @@ update_context: context->names[idx].name_len = AUDIT_NAME_FULL; context->names[idx].name_put = 0; /* don't call __putname() */ - if (inode) { - context->names[idx].ino = inode->i_ino; - context->names[idx].dev = inode->i_sb->s_dev; - context->names[idx].mode = inode->i_mode; - context->names[idx].uid = inode->i_uid; - context->names[idx].gid = inode->i_gid; - context->names[idx].rdev = inode->i_rdev; - audit_inode_context(idx, inode); - } else - context->names[idx].ino = (unsigned long)-1; + if (!inode) + context->names[idx].ino = (unsigned long)-1; + else + audit_copy_inode(&context->names[idx], inode); +} + +/** + * audit_inode_update - update inode info for last collected name + * @inode: inode being audited + * + * When open() is called on an existing object with the O_CREAT flag, the inode + * data audit initially collects is incorrect. This additional hook ensures + * audit has the inode data for the actual object to be opened. + */ +void __audit_inode_update(const struct inode *inode) +{ + struct audit_context *context = current->audit_context; + int idx; + + if (!context->in_syscall || !inode) + return; + + if (context->name_count == 0) { + context->name_count++; +#if AUDIT_DEBUG + context->ino_count++; +#endif + } + idx = context->name_count - 1; + + audit_copy_inode(&context->names[idx], inode); } /** -- cgit v1.2.3 From 73d3ec5abad3f1730ac8530899d2c14d92f3ad63 Mon Sep 17 00:00:00 2001 From: Amy Griffis Date: Thu, 13 Jul 2006 13:16:39 -0400 Subject: [PATCH] fix missed create event for directory audit When an object is created via a symlink into an audited directory, audit misses the event due to not having collected the inode data for the directory. Modify __audit_inode_child() to copy the parent inode data if a parent wasn't found in audit_names[]. 
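A hypothetical userspace reproducer (paths invented for illustration): the new file's parent directory is reached through a symlink, so before this fix its inode data was never collected into audit_names[].

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd;

	/* assume /watched is an audited directory */
	symlink("/watched", "/link");

	/* the parent of 'newfile' is resolved via the symlink */
	fd = open("/link/newfile", O_CREAT | O_WRONLY, 0600);
	if (fd >= 0)
		close(fd);
	return 0;
}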
Signed-off-by: Amy Griffis Signed-off-by: Al Viro --- kernel/auditsc.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b939ed2da3ee..b1356fc63b26 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1251,7 +1251,7 @@ void __audit_inode(const char *name, const struct inode *inode) * audit_inode_child - collect inode info for created/removed objects * @dname: inode's dentry name * @inode: inode being audited - * @pino: inode number of dentry parent + * @parent: inode of dentry parent * * For syscalls that create or remove filesystem objects, audit_inode * can only collect information for the filesystem object's parent. @@ -1262,7 +1262,7 @@ void __audit_inode(const char *name, const struct inode *inode) * unsuccessful attempts. */ void __audit_inode_child(const char *dname, const struct inode *inode, - unsigned long pino) + const struct inode *parent) { int idx; struct audit_context *context = current->audit_context; @@ -1276,7 +1276,7 @@ void __audit_inode_child(const char *dname, const struct inode *inode, if (!dname) goto update_context; for (idx = 0; idx < context->name_count; idx++) - if (context->names[idx].ino == pino) { + if (context->names[idx].ino == parent->i_ino) { const char *name = context->names[idx].name; if (!name) @@ -1304,6 +1304,16 @@ update_context: context->names[idx].ino = (unsigned long)-1; else audit_copy_inode(&context->names[idx], inode); + + /* A parent was not found in audit_names, so copy the inode data for the + * provided parent. */ + if (!found_name) { + idx = context->name_count++; +#if AUDIT_DEBUG + context->ino_count++; +#endif + audit_copy_inode(&context->names[idx], parent); + } } /** -- cgit v1.2.3 From 6988434ee5f532c71be3131fba23283f5cf43847 Mon Sep 17 00:00:00 2001 From: Amy Griffis Date: Thu, 13 Jul 2006 13:17:12 -0400 Subject: [PATCH] fix oops with CONFIG_AUDIT and !CONFIG_AUDITSYSCALL Always initialize the audit_inode_hash[] so we don't oops on list rules. Signed-off-by: Amy Griffis Signed-off-by: Al Viro --- kernel/audit.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index d417ca1db79b..0a36091ed712 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -690,9 +690,7 @@ static const struct inotify_operations audit_inotify_ops = { /* Initialize audit support at boot time. */ static int __init audit_init(void) { -#ifdef CONFIG_AUDITSYSCALL int i; -#endif printk(KERN_INFO "audit: initializing netlink socket (%s)\n", audit_default ? "enabled" : "disabled"); @@ -717,10 +715,10 @@ static int __init audit_init(void) audit_ih = inotify_init(&audit_inotify_ops); if (IS_ERR(audit_ih)) audit_panic("cannot initialize inotify handle"); +#endif for (i = 0; i < AUDIT_INODE_BUCKETS; i++) INIT_LIST_HEAD(&audit_inode_hash[i]); -#endif return 0; } -- cgit v1.2.3 From 5422e01ac16df7398b2bad1eccad0ae3be4dee32 Mon Sep 17 00:00:00 2001 From: Amy Griffis Date: Tue, 1 Aug 2006 17:52:26 -0400 Subject: [PATCH] fix audit oops with invalid operator Michael C Thompson wrote: [Tue Aug 01 2006, 02:36:36PM EDT] > The trigger for this oops is: > # auditctl -a exit,always -S pread64 -F 'inode<1' Setting the err value will fix it. 
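The bug class is a common one and worth isolating. In this sketch (hypothetical function and constant, kernel-style C), err still holds 0 from the last successful step, so branching to the cleanup label without assigning it turns the failure path into a silent success:

#define EXAMPLE_OP_EQUAL 0	/* hypothetical operator value */

static int example_check_op(u32 op)
{
	int err = 0;

	/* ... earlier steps succeeded, leaving err == 0 ... */

	switch (op) {
	case EXAMPLE_OP_EQUAL:
		break;
	default:
		err = -EINVAL;	/* the assignment this patch adds */
		goto exit_free;
	}
	return 0;

exit_free:
	/* ... undo partial work ... */
	return err;
}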
Signed-off-by: Amy Griffis Signed-off-by: Al Viro --- kernel/auditfilter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 5b4e16276ca0..32420f914028 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -442,6 +442,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) case AUDIT_EQUAL: break; default: + err = -EINVAL; goto exit_free; } } @@ -579,6 +580,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, case AUDIT_EQUAL: break; default: + err = -EINVAL; goto exit_free; } } -- cgit v1.2.3 From 471a5c7c839114cc8b55876203aeb2817c33e3c5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 10 Jul 2006 08:29:24 -0400 Subject: [PATCH] introduce audit rules counter Signed-off-by: Al Viro --- kernel/auditfilter.c | 24 ++++++++++++++++++++++++ kernel/auditsc.c | 3 +++ 2 files changed, 27 insertions(+) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 32420f914028..6a9a5c5a4e7d 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1136,6 +1136,14 @@ static inline int audit_add_rule(struct audit_entry *entry, struct audit_watch *watch = entry->rule.watch; struct nameidata *ndp, *ndw; int h, err, putnd_needed = 0; +#ifdef CONFIG_AUDITSYSCALL + int dont_count = 0; + + /* If either of these, don't count towards total */ + if (entry->rule.listnr == AUDIT_FILTER_USER || + entry->rule.listnr == AUDIT_FILTER_TYPE) + dont_count = 1; +#endif if (inode_f) { h = audit_hash_ino(inode_f->val); @@ -1176,6 +1184,10 @@ static inline int audit_add_rule(struct audit_entry *entry, } else { list_add_tail_rcu(&entry->list, list); } +#ifdef CONFIG_AUDITSYSCALL + if (!dont_count) + audit_n_rules++; +#endif mutex_unlock(&audit_filter_mutex); if (putnd_needed) @@ -1200,6 +1212,14 @@ static inline int audit_del_rule(struct audit_entry *entry, struct audit_watch *watch, *tmp_watch = entry->rule.watch; LIST_HEAD(inotify_list); int h, ret = 0; +#ifdef CONFIG_AUDITSYSCALL + int dont_count = 0; + + /* If either of these, don't count towards total */ + if (entry->rule.listnr == AUDIT_FILTER_USER || + entry->rule.listnr == AUDIT_FILTER_TYPE) + dont_count = 1; +#endif if (inode_f) { h = audit_hash_ino(inode_f->val); @@ -1237,6 +1257,10 @@ static inline int audit_del_rule(struct audit_entry *entry, list_del_rcu(&e->list); call_rcu(&e->rcu, audit_free_rule_rcu); +#ifdef CONFIG_AUDITSYSCALL + if (!dont_count) + audit_n_rules--; +#endif mutex_unlock(&audit_filter_mutex); if (!list_empty(&inotify_list)) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b1356fc63b26..3ea836d3d941 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -85,6 +85,9 @@ extern int audit_enabled; /* Indicates that audit should log the full pathname. */ #define AUDIT_NAME_FULL -1 +/* number of audit rules */ +int audit_n_rules; + /* When fs/namei.c:getname() is called, we store the pointer in name and * we don't let putname() free it (instead we free all of the saved * pointers at syscall exit time). 
-- cgit v1.2.3 From d51374adf5f2f88155a072d3d801104e3c0c3d7f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 3 Aug 2006 10:59:26 -0400 Subject: [PATCH] mark context of syscall entered with no rules as dummy Signed-off-by: Al Viro --- kernel/auditsc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 3ea836d3d941..9618d1507251 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -177,6 +177,7 @@ struct audit_aux_data_path { /* The per-task audit context. */ struct audit_context { + int dummy; /* must be the first element */ int in_syscall; /* 1 if task is in a syscall */ enum audit_state state; unsigned int serial; /* serial number for record */ @@ -517,7 +518,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, context->return_valid = return_valid; context->return_code = return_code; - if (context->in_syscall && !context->auditable) { + if (context->in_syscall && !context->dummy && !context->auditable) { enum audit_state state; state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]); @@ -1069,7 +1070,8 @@ void audit_syscall_entry(int arch, int major, context->argv[3] = a4; state = context->state; - if (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT) + context->dummy = !audit_n_rules; + if (!context->dummy && (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT)) state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); if (likely(state == AUDIT_DISABLED)) return; -- cgit v1.2.3 From 5ac3a9c26c1cc4861d9cdd8b293fecbfcdc81afe Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 16 Jul 2006 06:38:45 -0400 Subject: [PATCH] don't bother with aux entries for dummy context Signed-off-by: Al Viro --- kernel/auditsc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 9618d1507251..f571c7e925e6 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1676,7 +1676,7 @@ int audit_bprm(struct linux_binprm *bprm) unsigned long p, next; void *to; - if (likely(!audit_enabled || !context)) + if (likely(!audit_enabled || !context || context->dummy)) return 0; ax = kmalloc(sizeof(*ax) + PAGE_SIZE * MAX_ARG_PAGES - bprm->p, @@ -1714,7 +1714,7 @@ int audit_socketcall(int nargs, unsigned long *args) struct audit_aux_data_socketcall *ax; struct audit_context *context = current->audit_context; - if (likely(!context)) + if (likely(!context || context->dummy)) return 0; ax = kmalloc(sizeof(*ax) + nargs * sizeof(unsigned long), GFP_KERNEL); @@ -1742,7 +1742,7 @@ int audit_sockaddr(int len, void *a) struct audit_aux_data_sockaddr *ax; struct audit_context *context = current->audit_context; - if (likely(!context)) + if (likely(!context || context->dummy)) return 0; ax = kmalloc(sizeof(*ax) + len, GFP_KERNEL); -- cgit v1.2.3 From 3f2792ffbd88dc1cd41d226674cc428914981e98 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 16 Jul 2006 06:43:48 -0400 Subject: [PATCH] take filling ->pid, etc. out of audit_get_context() move that stuff downstream and into the only branch where it'll be used. 
Signed-off-by: Al Viro
---
 kernel/auditsc.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)
(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index f571c7e925e6..efc1b74bebf3 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -534,17 +534,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
}
get_context:
- context->pid = tsk->pid;
- context->ppid = sys_getppid(); /* sic. tsk == current in all cases */
- context->uid = tsk->uid;
- context->gid = tsk->gid;
- context->euid = tsk->euid;
- context->suid = tsk->suid;
- context->fsuid = tsk->fsuid;
- context->egid = tsk->egid;
- context->sgid = tsk->sgid;
- context->fsgid = tsk->fsgid;
- context->personality = tsk->personality;
+ tsk->audit_context = NULL;
return context;
}
@@ -753,6 +743,17 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
const char *tty;
/* tsk == current */
+ context->pid = tsk->pid;
+ context->ppid = sys_getppid(); /* sic. tsk == current in all cases */
+ context->uid = tsk->uid;
+ context->gid = tsk->gid;
+ context->euid = tsk->euid;
+ context->suid = tsk->suid;
+ context->fsuid = tsk->fsuid;
+ context->egid = tsk->egid;
+ context->sgid = tsk->sgid;
+ context->fsgid = tsk->fsgid;
+ context->personality = tsk->personality;
ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
if (!ab)
-- cgit v1.2.3

From a7ef7878ea7c8bca9b624db3f61223cdadda2a0a Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki"
Date: Sat, 5 Aug 2006 12:13:42 -0700
Subject: [PATCH] Make suspend possible with a traced process at a breakpoint

It should be possible to suspend, either to RAM or to disk, if there's a
traced process that has just reached a breakpoint. However, this is a
special case, because its parent process might have been frozen already
and then we are unable to deliver the "freeze" signal to the traced
process. If this happens, it's better to cancel the freezing of the
traced process.

Ref. http://bugzilla.kernel.org/show_bug.cgi?id=6787

Signed-off-by: Rafael J. Wysocki
Acked-by: Pavel Machek
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/power/process.c | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)
(limited to 'kernel')

diff --git a/kernel/power/process.c b/kernel/power/process.c
index b2a5f671d6cd..72e72d2c61e6 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -66,13 +66,25 @@ static inline void freeze_process(struct task_struct *p)
}
}
+static void cancel_freezing(struct task_struct *p)
+{
+ unsigned long flags;
+
+ if (freezing(p)) {
+ pr_debug(" clean up: %s\n", p->comm);
+ do_not_freeze(p);
+ spin_lock_irqsave(&p->sighand->siglock, flags);
+ recalc_sigpending_tsk(p);
+ spin_unlock_irqrestore(&p->sighand->siglock, flags);
+ }
+}
+
/* 0 = success, else # of processes that we failed to stop */
int freeze_processes(void)
{
int todo, nr_user, user_frozen;
unsigned long start_time;
struct task_struct *g, *p;
- unsigned long flags;
printk( "Stopping tasks: " );
start_time = jiffies;
@@ -85,6 +97,10 @@ int freeze_processes(void)
continue;
if (frozen(p))
continue;
+ if (p->state == TASK_TRACED && frozen(p->parent)) {
+ cancel_freezing(p);
+ continue;
+ }
if (p->mm && !(p->flags & PF_BORROWED_MM)) {
/* The task is a user-space one.
* Freeze it unless there's a vfork completion
@@ -126,13 +142,7 @@ int freeze_processes(void)
do_each_thread(g, p) {
if (freezeable(p) && !frozen(p))
printk(KERN_ERR " %s\n", p->comm);
- if (freezing(p)) {
- pr_debug(" clean up: %s\n", p->comm);
- p->flags &= ~PF_FREEZE;
- spin_lock_irqsave(&p->sighand->siglock, flags);
- recalc_sigpending_tsk(p);
- spin_unlock_irqrestore(&p->sighand->siglock, flags);
- }
+ cancel_freezing(p);
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
return todo;
-- cgit v1.2.3

From e91467ecd1ef381377fd327c0ded922835ec52ab Mon Sep 17 00:00:00 2001
From: Christian Borntraeger
Date: Sat, 5 Aug 2006 12:13:52 -0700
Subject: [PATCH] bug in futex unqueue_me

This patch adds a barrier() in futex unqueue_me to avoid aliasing of two
pointers.

On my s390x system I saw the following oops:

Unable to handle kernel pointer dereference at virtual kernel address
0000000000000000
Oops: 0004 [#1]
CPU: 0 Not tainted
Process mytool (pid: 13613, task: 000000003ecb6ac0, ksp: 00000000366bdbd8)
Krnl PSW : 0704d00180000000 00000000003c9ac2 (_spin_lock+0xe/0x30)
Krnl GPRS: 00000000ffffffff 000000003ecb6ac0 0000000000000000 0700000000000000
           0000000000000000 0000000000000000 000001fe00002028 00000000000c091f
           000001fe00002054 000001fe00002054 0000000000000000 00000000366bddc0
           00000000005ef8c0 00000000003d00e8 0000000000144f91 00000000366bdcb8
Krnl Code: ba 4e 20 00 12 44 b9 16 00 3e a7 84 00 08 e3 e0 f0 88 00 04
Call Trace:
([<0000000000144f90>] unqueue_me+0x40/0xe4)
[<0000000000145a0c>] do_futex+0x33c/0xc40
[<000000000014643e>] sys_futex+0x12e/0x144
[<000000000010bb00>] sysc_noemu+0x10/0x16
[<000002000003741c>] 0x2000003741c

The code in question is:

static int unqueue_me(struct futex_q *q)
{
	int ret = 0;
	spinlock_t *lock_ptr;

	/* In the common case we don't take the spinlock, which is nice. */
retry:
	lock_ptr = q->lock_ptr;
	if (lock_ptr != 0) {
		spin_lock(lock_ptr);
		/*
		 * q->lock_ptr can change between reading it and
		 * spin_lock(), causing us to take the wrong lock. This
		 * corrects the race condition.
[...]

and my compiler (gcc 4.1.0) makes the following out of it:

00000000000003c8 :
 3c8: eb bf f0 70 00 24 stmg %r11,%r15,112(%r15)
 3ce: c0 d0 00 00 00 00 larl %r13,3ce
 3d0: R_390_PC32DBL .rodata+0x2a
 3d4: a7 f1 1e 00 tml %r15,7680
 3d8: a7 84 00 01 je 3da
 3dc: b9 04 00 ef lgr %r14,%r15
 3e0: a7 fb ff d0 aghi %r15,-48
 3e4: b9 04 00 b2 lgr %r11,%r2
 3e8: e3 e0 f0 98 00 24 stg %r14,152(%r15)
 3ee: e3 c0 b0 28 00 04 lg %r12,40(%r11)
     /* write q->lock_ptr in r12 */
 3f4: b9 02 00 cc ltgr %r12,%r12
 3f8: a7 84 00 4b je 48e
     /* if r12 is zero then jump over the code.... */
 3fc: e3 20 b0 28 00 04 lg %r2,40(%r11)
     /* write q->lock_ptr in r2 */
 402: c0 e5 00 00 00 00 brasl %r14,402
 404: R_390_PC32DBL _spin_lock+0x2
     /* use r2 as parameter for spin_lock */

So the code becomes more or less:

if (q->lock_ptr != 0)
	spin_lock(q->lock_ptr)

instead of

if (lock_ptr != 0)
	spin_lock(lock_ptr)

which caused the oops from above.

After adding a barrier, gcc creates code without this problem:

[...] (the same)
 3ee: e3 c0 b0 28 00 04 lg %r12,40(%r11)
 3f4: b9 02 00 cc ltgr %r12,%r12
 3f8: b9 04 00 2c lgr %r2,%r12
 3fc: a7 84 00 48 je 48c
 400: c0 e5 00 00 00 00 brasl %r14,400
 402: R_390_PC32DBL _spin_lock+0x2

As a general note, this unqueue_me code seems a bit fishy. The retry logic
of unqueue_me only works if we can guarantee that the original value of
q->lock_ptr is always a spinlock (otherwise we overwrite kernel memory).
We know that q->lock_ptr can change.
I don't know what happens to the original spinlock, as I am not an expert
on the futex code.

Cc: Martin Schwidefsky
Cc: Rusty Russell
Acked-by: Ingo Molnar
Cc: Thomas Gleixner
Signed-off-by: Christian Borntraeger
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/futex.c | 1 +
 1 file changed, 1 insertion(+)
(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index dda2049692a2..c2b2e0b83abf 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -948,6 +948,7 @@ static int unqueue_me(struct futex_q *q)
/* In the common case we don't take the spinlock, which is nice. */
retry:
lock_ptr = q->lock_ptr;
+ barrier();
if (lock_ptr != 0) {
spin_lock(lock_ptr);
/*
-- cgit v1.2.3

From 9f59ce5d0e0dd837853385927b150f5cef3a7f52 Mon Sep 17 00:00:00 2001
From: Chuck Ebbert <76306.1226@compuserve.com>
Date: Sat, 5 Aug 2006 12:14:11 -0700
Subject: [PATCH] ptrace: make pid of child process available for
 PTRACE_EVENT_VFORK_DONE

When delivering PTRACE_EVENT_VFORK_DONE, provide the pid of the child
process when the tracer calls ptrace(PTRACE_GETEVENTMSG). This is already
(accidentally) available when the tracer is tracing VFORK in addition to
VFORK_DONE.

Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com>
Cc: Daniel Jacobowitz
Cc: Albert Cahalan
Cc: Roland McGrath
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/fork.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)
(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 1b0f7b1e0881..aa36c43783cc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1387,8 +1387,10 @@ long do_fork(unsigned long clone_flags,
if (clone_flags & CLONE_VFORK) {
wait_for_completion(&vfork);
- if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE))
+ if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) {
+ current->ptrace_message = nr;
ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
+ }
}
} else {
free_pid(pid);
-- cgit v1.2.3

From 78944e549d36673eb6265a2411574e79c28e23dc Mon Sep 17 00:00:00 2001
From: "Antonino A. Daplas"
Date: Sat, 5 Aug 2006 12:14:16 -0700
Subject: [PATCH] vt: printk: Fix framebuffer console triggering might_sleep
 assertion

Reported by: Dave Jones

Whilst printk'ing to both console and serial console, I got this...
(2.6.18rc1)

BUG: sleeping function called from invalid context at kernel/sched.c:4438
in_atomic():0, irqs_disabled():1

Call Trace:
[] show_trace+0xaa/0x23d
[] dump_stack+0x15/0x17
[] __might_sleep+0xb2/0xb4
[] __cond_resched+0x15/0x55
[] cond_resched+0x3b/0x42
[] console_conditional_schedule+0x12/0x14
[] fbcon_redraw+0xf6/0x160
[] fbcon_scroll+0x5d9/0xb52
[] scrup+0x6b/0xd6
[] lf+0x24/0x44
[] vt_console_print+0x166/0x23d
[] __call_console_drivers+0x65/0x76
[] _call_console_drivers+0x5e/0x62
[] release_console_sem+0x14b/0x232
[] fb_flashcursor+0x279/0x2a6
[] run_workqueue+0xa8/0xfb
[] worker_thread+0xef/0x122
[] kthread+0x100/0x136
[] child_rip+0x8/0x12

This can occur when release_console_sem() is called but the log buffer
still has contents that need to be flushed. The console drivers are
called while the console_may_schedule flag is still true. The
might_sleep() is triggered when fbcon calls
console_conditional_schedule().

Fix by setting console_may_schedule to zero earlier, before the call to
the console drivers.
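
The essence of the fix is an ordering rule: the flag that permits sleeping
has to be cleared before any console driver can observe it from the flush
path. A compressed user-space sketch of that ordering -- may_schedule,
cond_resched_if_allowed and flush_to_drivers are illustrative names, not
the kernel's:

#include <stdio.h>

static int may_schedule;

/* Stand-in for console_conditional_schedule(): only acts when the
 * flag says sleeping is currently safe. */
static void cond_resched_if_allowed(void)
{
	if (may_schedule)
		printf("cond_resched() -- would sleep here\n");
}

static void flush_to_drivers(void)
{
	may_schedule = 0; /* cleared first: the drivers below must not sleep */
	for (int i = 0; i < 3; i++) {
		/* a driver (fbcon, say) polls for a reschedule point;
		 * with the flag already clear this is now a no-op */
		cond_resched_if_allowed();
		printf("driver writes chunk %d\n", i);
	}
}

int main(void)
{
	may_schedule = 1; /* as set when the console semaphore was taken */
	flush_to_drivers();
	return 0;
}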
Signed-off-by: Antonino Daplas
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/printk.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)
(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 65ca0688f86f..1149365e989e 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -799,6 +799,9 @@ void release_console_sem(void)
up(&secondary_console_sem);
return;
}
+
+ console_may_schedule = 0;
+
for ( ; ; ) {
spin_lock_irqsave(&logbuf_lock, flags);
wake_klogd |= log_start - log_end;
@@ -812,7 +815,6 @@ void release_console_sem(void)
local_irq_restore(flags);
}
console_locked = 0;
- console_may_schedule = 0;
up(&console_sem);
spin_unlock_irqrestore(&logbuf_lock, flags);
if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) {
-- cgit v1.2.3

From 0f04ab5efbca73ab366a156d96b073d2da35b158 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki
Date: Sat, 5 Aug 2006 12:14:59 -0700
Subject: [PATCH] memory hotadd fixes: change find_next_system_ram's return
 value manner

find_next_system_ram() returns a valid memory range which meets the
requested area, and is only used by memory hot-add. This function always
rewrites the requested resource even if the returned area does not fully
fit within the requested one, and sometimes the returned resource is
larger than the requested area. This annoys the caller. This patch
changes the returned value to fit within the requested area.

Signed-off-by: KAMEZAWA Hiroyuki
Cc: Keith Mannthey
Cc: Yasunori Goto
Cc: Dave Hansen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/resource.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)
(limited to 'kernel')

diff --git a/kernel/resource.c b/kernel/resource.c
index 0dd3a857579e..63e879379dbd 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -261,8 +261,10 @@ int find_next_system_ram(struct resource *res)
if (!p)
return -1;
/* copy data */
- res->start = p->start;
- res->end = p->end;
+ if (res->start < p->start)
+ res->start = p->start;
+ if (res->end > p->end)
+ res->end = p->end;
return 0;
}
#endif
-- cgit v1.2.3

From 58c1b5b079071d82b2f924000b7e8fb5585ce7d8 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki
Date: Sat, 5 Aug 2006 12:15:01 -0700
Subject: [PATCH] memory hotadd fixes: find_next_system_ram catch range fix

find_next_system_ram() is used to find an available memory resource when
onlining newly added memory. This patch fixes the following problem:
find_next_system_ram() cannot catch this case.
Resource:  (start)-------------(end)
Section :           (start)-------------(end)

Signed-off-by: KAMEZAWA Hiroyuki
Cc: Keith Mannthey
Cc: Yasunori Goto
Cc: Dave Hansen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/resource.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
(limited to 'kernel')

diff --git a/kernel/resource.c b/kernel/resource.c
index 63e879379dbd..46286434af80 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -244,6 +244,7 @@ int find_next_system_ram(struct resource *res)
start = res->start;
end = res->end;
+ BUG_ON(start >= end);
read_lock(&resource_lock);
for (p = iomem_resource.child; p ; p = p->sibling) {
@@ -254,7 +255,7 @@ int find_next_system_ram(struct resource *res)
p = NULL;
break;
}
- if (p->start >= start)
+ if ((p->end >= start) && (p->start < end))
break;
}
read_unlock(&resource_lock);
-- cgit v1.2.3

From ce2c6b53847afc444c4d0a7a1075c61f499c57a5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Sat, 5 Aug 2006 12:15:15 -0700
Subject: [PATCH] futex: Apply recent futex fixes to futex_compat

The recent fixups in futex.c need to be applied to futex_compat.c too.
Fixes a hang reported by Olaf.

Signed-off-by: Thomas Gleixner
Cc: Olaf Hering
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/futex_compat.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
(limited to 'kernel')

diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index d1aab1a452cc..c5cca3f65cb7 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -39,7 +39,7 @@ void compat_exit_robust_list(struct task_struct *curr)
{
struct compat_robust_list_head __user *head = curr->compat_robust_list;
struct robust_list __user *entry, *pending;
- unsigned int limit = ROBUST_LIST_LIMIT, pi;
+ unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
compat_uptr_t uentry, upending;
compat_long_t futex_offset;
@@ -59,10 +59,10 @@ void compat_exit_robust_list(struct task_struct *curr)
* if it exists:
*/
if (fetch_robust_entry(&upending, &pending,
- &head->list_op_pending, &pi))
+ &head->list_op_pending, &pip))
return;
if (upending)
- handle_futex_death((void *)pending + futex_offset, curr, pi);
+ handle_futex_death((void *)pending + futex_offset, curr, pip);
while (compat_ptr(uentry) != &head->list) {
/*
-- cgit v1.2.3
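
A closing note on why the separate pip variable matters: bit 0 of every
robust-list word carries the PI flag, and fetch_robust_entry() decodes the
pointer and the flag together, so the flag already decoded for the first
list entry must not be clobbered when list_op_pending is decoded into the
same variable. A small user-space sketch of the encoding -- decode_entry
and the raw values are assumptions for illustration, not the kernel's
helpers:

#include <stdint.h>
#include <stdio.h>

/* Bit 0 of a robust-list entry marks a PI futex; the pointer itself is
 * the entry address with bit 0 cleared. */
static void decode_entry(uintptr_t raw, uintptr_t *addr, unsigned int *pi)
{
	*pi = (unsigned int)(raw & 1);
	*addr = raw & ~(uintptr_t)1;
}

int main(void)
{
	uintptr_t head_raw = 0x1000 | 1; /* PI entry on the list */
	uintptr_t pend_raw = 0x2000; /* non-PI list_op_pending */
	uintptr_t addr;
	unsigned int pi, pip; /* separate flags, as in the fix */

	decode_entry(head_raw, &addr, &pi);
	decode_entry(pend_raw, &addr, &pip); /* must not clobber 'pi' */

	printf("list entry: pi=%u, pending: pip=%u\n", pi, pip);
	return 0;
}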