From 57ffc5ca679f499f4704fd9b6a372916f59930ee Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 18 Jun 2015 12:32:49 +0200 Subject: perf: Fix AUX buffer refcounting Its currently possible to drop the last refcount to the aux buffer from NMI context, which results in the expected fireworks. The refcounting needs a bigger overhaul, but to cure the immediate problem, delay the freeing by using an irq_work. Reviewed-and-tested-by: Alexander Shishkin Reported-by: Vince Weaver Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150618103249.GK19282@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 8 -------- kernel/events/internal.h | 10 ++++++++++ kernel/events/ring_buffer.c | 27 +++++++++++++++++++++++++-- 3 files changed, 35 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index e965cfae4207..d3dae3419b99 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4358,14 +4358,6 @@ static void ring_buffer_wakeup(struct perf_event *event) rcu_read_unlock(); } -static void rb_free_rcu(struct rcu_head *rcu_head) -{ - struct ring_buffer *rb; - - rb = container_of(rcu_head, struct ring_buffer, rcu_head); - rb_free(rb); -} - struct ring_buffer *ring_buffer_get(struct perf_event *event) { struct ring_buffer *rb; diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 2deb24c7a40d..2bbad9c1274c 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -11,6 +11,7 @@ struct ring_buffer { atomic_t refcount; struct rcu_head rcu_head; + struct irq_work irq_work; #ifdef CONFIG_PERF_USE_VMALLOC struct work_struct work; int page_order; /* allocation order */ @@ -55,6 +56,15 @@ struct ring_buffer { }; extern void rb_free(struct ring_buffer *rb); + +static inline void rb_free_rcu(struct rcu_head *rcu_head) +{ + struct ring_buffer *rb; + + rb = container_of(rcu_head, struct ring_buffer, rcu_head); + rb_free(rb); +} + extern struct ring_buffer * rb_alloc(int nr_pages, long watermark, int cpu, int flags); extern void perf_event_wakeup(struct perf_event *event); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 96472824a752..b2be01b1aa9d 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -221,6 +221,8 @@ void perf_output_end(struct perf_output_handle *handle) rcu_read_unlock(); } +static void rb_irq_work(struct irq_work *work); + static void ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) { @@ -241,6 +243,16 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) INIT_LIST_HEAD(&rb->event_list); spin_lock_init(&rb->event_lock); + init_irq_work(&rb->irq_work, rb_irq_work); +} + +static void ring_buffer_put_async(struct ring_buffer *rb) +{ + if (!atomic_dec_and_test(&rb->refcount)) + return; + + rb->rcu_head.next = (void *)rb; + irq_work_queue(&rb->irq_work); } /* @@ -319,7 +331,7 @@ err_put: rb_free_aux(rb); err: - ring_buffer_put(rb); + ring_buffer_put_async(rb); handle->event = NULL; return NULL; @@ -370,7 +382,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, local_set(&rb->aux_nest, 0); rb_free_aux(rb); - ring_buffer_put(rb); + ring_buffer_put_async(rb); } /* @@ -557,7 +569,18 @@ static void __rb_free_aux(struct ring_buffer *rb) void rb_free_aux(struct ring_buffer *rb) { if (atomic_dec_and_test(&rb->aux_refcount)) + irq_work_queue(&rb->irq_work); +} + +static void rb_irq_work(struct irq_work *work) +{ + struct ring_buffer *rb = container_of(work, struct ring_buffer, irq_work); + + if (!atomic_read(&rb->aux_refcount)) __rb_free_aux(rb); + + if (rb->rcu_head.next == (void *)rb) + call_rcu(&rb->rcu_head, rb_free_rcu); } #ifndef CONFIG_PERF_USE_VMALLOC -- cgit v1.2.3 From d49db342f0e276b354383b3281c5668b6b80f5c2 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 24 Jun 2015 12:41:47 -0700 Subject: sched/fair: Test list head instead of list entry in throttle_cfs_rq() According to the comments, we need to test if this is the first throttled task, however, list_empty() tests on the entry cfs_rq->throttled_list, not the head, this is wrong. This is a bug because we don't re-init the list entry after removing it from the list, so list_empty() could return false even if the list is really empty. Signed-off-by: Cong Wang Signed-off-by: Cong Wang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ben Segall Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1435174907-432-1-git-send-email-xiyou.wangcong@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 65c8f3ebdc3c..d113c3ba8bc4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3683,7 +3683,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq); raw_spin_lock(&cfs_b->lock); - empty = list_empty(&cfs_rq->throttled_list); + empty = list_empty(&cfs_b->throttled_cfs_rq); /* * Add to the _head_ of the list, so that an already-started -- cgit v1.2.3 From 7c4a976cd55972b68c75a978f171b6db5df4ce66 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 7 Jul 2015 10:14:35 +0200 Subject: clockevents: Allow set-state callbacks to be optional Its mandatory for the drivers to provide set_state_{oneshot|periodic}() (only if related modes are supported) and set_state_shutdown() callbacks today, if they are implementing the new set-state interface. But this leads to unnecessary noop callbacks for drivers which don't want to implement them. Over that, it will lead to a full function call for nothing really useful. Lets make all set-state callbacks optional. Suggested-by: Daniel Lezcano Signed-off-by: Viresh Kumar Signed-off-by: Daniel Lezcano Link: http://lkml.kernel.org/r/1436256875-15562-1-git-send-email-daniel.lezcano@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 08ccc3da3ca0..50eb107f1198 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -120,19 +120,25 @@ static int __clockevents_switch_state(struct clock_event_device *dev, /* The clockevent device is getting replaced. Shut it down. */ case CLOCK_EVT_STATE_SHUTDOWN: - return dev->set_state_shutdown(dev); + if (dev->set_state_shutdown) + return dev->set_state_shutdown(dev); + return 0; case CLOCK_EVT_STATE_PERIODIC: /* Core internal bug */ if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC)) return -ENOSYS; - return dev->set_state_periodic(dev); + if (dev->set_state_periodic) + return dev->set_state_periodic(dev); + return 0; case CLOCK_EVT_STATE_ONESHOT: /* Core internal bug */ if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) return -ENOSYS; - return dev->set_state_oneshot(dev); + if (dev->set_state_oneshot) + return dev->set_state_oneshot(dev); + return 0; case CLOCK_EVT_STATE_ONESHOT_STOPPED: /* Core internal bug */ @@ -471,18 +477,6 @@ static int clockevents_sanity_check(struct clock_event_device *dev) if (dev->features & CLOCK_EVT_FEAT_DUMMY) return 0; - /* New state-specific callbacks */ - if (!dev->set_state_shutdown) - return -EINVAL; - - if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && - !dev->set_state_periodic) - return -EINVAL; - - if ((dev->features & CLOCK_EVT_FEAT_ONESHOT) && - !dev->set_state_oneshot) - return -EINVAL; - return 0; } -- cgit v1.2.3 From 8eb231261fdd20768db23863d00ef277de4b0543 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Jul 2015 14:11:00 +0200 Subject: tick/broadcast: Prevent hrtimer recursion The hrtimer based broadcast vehicle can cause a hrtimer recursion which went unnoticed until we changed the hrtimer expiry code to keep track of the currently running timer. local_timer_interrupt() local_handler() hrtimer_interrupt() expire_hrtimers() broadcast_hrtimer() send_ipis() local_handler() hrtimer_interrupt() .... Solution is simple: Prevent the local handler call from the broadcast code when the broadcast 'device' is hrtimer based. [ Split out from a larger combo patch ] Tested-by: Sudeep Holla Signed-off-by: Thomas Gleixner Cc: Suzuki Poulose Cc: Lorenzo Pieralisi Cc: Catalin Marinas Cc: Rafael J. Wysocki Cc: Peter Zijlstra Cc: Preeti U Murthy Cc: Ingo Molnar Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1507070929360.3916@nanos --- kernel/time/tick-broadcast.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index d39f32cdd1b5..a76204089f78 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -265,8 +265,22 @@ static bool tick_do_broadcast(struct cpumask *mask) * Check, if the current cpu is in the mask */ if (cpumask_test_cpu(cpu, mask)) { + struct clock_event_device *bc = tick_broadcast_device.evtdev; + cpumask_clear_cpu(cpu, mask); - local = true; + /* + * We only run the local handler, if the broadcast + * device is not hrtimer based. Otherwise we run into + * a hrtimer recursion. + * + * local timer_interrupt() + * local_handler() + * expire_hrtimers() + * bc_handler() + * local_handler() + * expire_hrtimers() + */ + local = !(bc->features & CLOCK_EVT_FEAT_HRTIMER); } if (!cpumask_empty(mask)) { -- cgit v1.2.3 From e0454311903d3fd0f12a86c9e65d7b271c2bb05d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Jul 2015 14:07:27 +0200 Subject: tick/broadcast: Sanity check the shutdown of the local clock_event The broadcast code shuts down the local clock event unconditionally even if no broadcast device is installed or if the broadcast device is hrtimer based. Add proper sanity checks. [ Split out from a larger combo patch ] Reported-and-tested-by: Sudeep Holla Signed-off-by: Thomas Gleixner Cc: Suzuki Poulose Cc: Lorenzo Pieralisi Cc: Catalin Marinas Cc: Rafael J. Wysocki Cc: Peter Zijlstra Cc: Preeti U Murthy Cc: Ingo Molnar Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1507070929360.3916@nanos --- kernel/time/tick-broadcast.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index a76204089f78..9877d0b0aefc 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -159,7 +159,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { struct clock_event_device *bc = tick_broadcast_device.evtdev; unsigned long flags; - int ret; + int ret = 0; raw_spin_lock_irqsave(&tick_broadcast_lock, flags); @@ -221,13 +221,14 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) * If we kept the cpu in the broadcast mask, * tell the caller to leave the per cpu device * in shutdown state. The periodic interrupt - * is delivered by the broadcast device. + * is delivered by the broadcast device, if + * the broadcast device exists and is not + * hrtimer based. */ - ret = cpumask_test_cpu(cpu, tick_broadcast_mask); + if (bc && !(bc->features & CLOCK_EVT_FEAT_HRTIMER)) + ret = cpumask_test_cpu(cpu, tick_broadcast_mask); break; default: - /* Nothing to do */ - ret = 0; break; } } @@ -373,8 +374,16 @@ void tick_broadcast_control(enum tick_broadcast_mode mode) case TICK_BROADCAST_ON: cpumask_set_cpu(cpu, tick_broadcast_on); if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { - if (tick_broadcast_device.mode == - TICKDEV_MODE_PERIODIC) + /* + * Only shutdown the cpu local device, if: + * + * - the broadcast device exists + * - the broadcast device is not a hrtimer based one + * - the broadcast device is in periodic mode to + * avoid a hickup during switch to oneshot mode + */ + if (bc && !(bc->features & CLOCK_EVT_FEAT_HRTIMER) && + tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) clockevents_shutdown(dev); } break; -- cgit v1.2.3 From f32dd117051185da6e923b35491a44d7debeeea5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Jul 2015 16:29:38 +0200 Subject: tick/broadcast: Make idle check independent from mode and config Currently the broadcast busy check, which prevents the idle code from going into deep idle, works only in one shot mode. If NOHZ and HIGHRES are off (config or command line) there is no sanity check at all, so under certain conditions cpus are allowed to go into deep idle, where the local timer stops, and are not woken up again because there is no broadcast timer installed or a hrtimer based broadcast device is not evaluated. Move tick_broadcast_oneshot_control() into the common code and provide proper subfunctions for the various config combinations. The common check in tick_broadcast_oneshot_control() is for the C3STOP misfeature flag of the local clock event device. If its not set, idle can proceed. If set, further checks are necessary. Provide checks for the trivial cases: - If broadcast is disabled in the config, then return busy - If oneshot mode (NOHZ/HIGHES) is disabled in the config, return busy if the broadcast device is hrtimer based. - If oneshot mode is enabled in the config call the original tick_broadcast_oneshot_control() function. That function needs extra checks which will be implemented in seperate patches. [ Split out from a larger combo patch ] Reported-and-tested-by: Sudeep Holla Signed-off-by: Thomas Gleixner Cc: Suzuki Poulose Cc: Lorenzo Pieralisi Cc: Catalin Marinas Cc: Rafael J. Wysocki Cc: Peter Zijlstra Cc: Preeti U Murthy Cc: Ingo Molnar Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1507070929360.3916@nanos --- kernel/time/tick-broadcast.c | 26 +++++++++++--------------- kernel/time/tick-common.c | 21 +++++++++++++++++++++ kernel/time/tick-sched.h | 10 ++++++++++ 3 files changed, 42 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 9877d0b0aefc..ef77b16ad5df 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -685,18 +685,7 @@ static void broadcast_shutdown_local(struct clock_event_device *bc, clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN); } -/** - * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode - * @state: The target state (enter/exit) - * - * The system enters/leaves a state, where affected devices might stop - * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups. - * - * Called with interrupts disabled, so clockevents_lock is not - * required here because the local clock event device cannot go away - * under us. - */ -int tick_broadcast_oneshot_control(enum tick_broadcast_state state) +int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) { struct clock_event_device *bc, *dev; struct tick_device *td; @@ -717,9 +706,6 @@ int tick_broadcast_oneshot_control(enum tick_broadcast_state state) td = this_cpu_ptr(&tick_cpu_device); dev = td->evtdev; - if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) - return 0; - raw_spin_lock(&tick_broadcast_lock); bc = tick_broadcast_device.evtdev; cpu = smp_processor_id(); @@ -961,6 +947,16 @@ bool tick_broadcast_oneshot_available(void) return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false; } +#else +int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) +{ + struct clock_event_device *bc = tick_broadcast_device.evtdev; + + if (!bc || (bc->features & CLOCK_EVT_FEAT_HRTIMER)) + return -EBUSY; + + return 0; +} #endif void __init tick_broadcast_init(void) diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 76446cb5dfe1..55e13efff1ab 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -343,6 +343,27 @@ out_bc: tick_install_broadcast_device(newdev); } +/** + * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode + * @state: The target state (enter/exit) + * + * The system enters/leaves a state, where affected devices might stop + * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups. + * + * Called with interrupts disabled, so clockevents_lock is not + * required here because the local clock event device cannot go away + * under us. + */ +int tick_broadcast_oneshot_control(enum tick_broadcast_state state) +{ + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + + if (!(td->evtdev->features & CLOCK_EVT_FEAT_C3STOP)) + return 0; + + return __tick_broadcast_oneshot_control(state); +} + #ifdef CONFIG_HOTPLUG_CPU /* * Transfer the do_timer job away from a dying cpu. diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index 42fdf4958bcc..a4a8d4e9baa1 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -71,4 +71,14 @@ extern void tick_cancel_sched_timer(int cpu); static inline void tick_cancel_sched_timer(int cpu) { } #endif +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +extern int __tick_broadcast_oneshot_control(enum tick_broadcast_state state); +#else +static inline int +__tick_broadcast_oneshot_control(enum tick_broadcast_state state) +{ + return -EBUSY; +} +#endif + #endif -- cgit v1.2.3 From b78f3f3c898c824bf56ab55cfa59fc72be49c349 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Jul 2015 16:34:32 +0200 Subject: tick/broadcast: Prevent deep idle if no broadcast device available Add a check for a installed broadcast device to the oneshot control function and return busy if not. [ Split out from a larger combo patch ] Reported-and-tested-by: Sudeep Holla Signed-off-by: Thomas Gleixner Cc: Suzuki Poulose Cc: Lorenzo Pieralisi Cc: Catalin Marinas Cc: Rafael J. Wysocki Cc: Peter Zijlstra Cc: Preeti U Murthy Cc: Ingo Molnar Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1507070929360.3916@nanos --- kernel/time/tick-broadcast.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index ef77b16ad5df..fad3f789beec 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -692,6 +692,13 @@ int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) int cpu, ret = 0; ktime_t now; + /* + * If there is no broadcast device, tell the caller not to go + * into deep idle. + */ + if (!tick_broadcast_device.evtdev) + return -EBUSY; + /* * Periodic mode does not care about the enter/exit of power * states -- cgit v1.2.3 From e3ac79e087ffe8a1f953ed44a74acf7616cb0b25 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Jul 2015 16:38:11 +0200 Subject: tick/broadcast: Move the check for periodic mode inside state handling We need to check more than the periodic mode for proper operation in all runtime combinations. To avoid code duplication move the check into the enter state handling. No functional change. [ Split out from a larger combo patch ] Reported-and-tested-by: Sudeep Holla Signed-off-by: Thomas Gleixner Cc: Suzuki Poulose Cc: Lorenzo Pieralisi Cc: Catalin Marinas Cc: Rafael J. Wysocki Cc: Peter Zijlstra Cc: Preeti U Murthy Cc: Ingo Molnar Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1507070929360.3916@nanos --- kernel/time/tick-broadcast.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index fad3f789beec..83aa92e87a85 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -688,7 +688,6 @@ static void broadcast_shutdown_local(struct clock_event_device *bc, int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) { struct clock_event_device *bc, *dev; - struct tick_device *td; int cpu, ret = 0; ktime_t now; @@ -699,25 +698,20 @@ int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) if (!tick_broadcast_device.evtdev) return -EBUSY; - /* - * Periodic mode does not care about the enter/exit of power - * states - */ - if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) - return 0; - - /* - * We are called with preemtion disabled from the depth of the - * idle code, so we can't be moved away. - */ - td = this_cpu_ptr(&tick_cpu_device); - dev = td->evtdev; + dev = this_cpu_ptr(&tick_cpu_device)->evtdev; raw_spin_lock(&tick_broadcast_lock); bc = tick_broadcast_device.evtdev; cpu = smp_processor_id(); if (state == TICK_BROADCAST_ENTER) { + /* + * If the broadcast device is in periodic mode, we + * return. + */ + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) + goto out; + if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); broadcast_shutdown_local(bc, dev); -- cgit v1.2.3 From d33257264b0267a8fd20f6717abbb484c9e21130 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Jul 2015 17:45:22 +0200 Subject: tick/broadcast: Return busy if periodic mode and hrtimer broadcast If the system is in periodic mode and the broadcast device is hrtimer based, return busy as we have no proper handling for this. [ Split out from a larger combo patch ] Tested-by: Sudeep Holla Signed-off-by: Thomas Gleixner Cc: Suzuki Poulose Cc: Lorenzo Pieralisi Cc: Catalin Marinas Cc: Rafael J. Wysocki Cc: Peter Zijlstra Cc: Preeti U Murthy Cc: Ingo Molnar Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1507070929360.3916@nanos --- kernel/time/tick-broadcast.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 83aa92e87a85..da7b40fdf6d0 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -709,8 +709,12 @@ int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) * If the broadcast device is in periodic mode, we * return. */ - if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { + /* If it is a hrtimer based broadcast, return busy */ + if (bc->features & CLOCK_EVT_FEAT_HRTIMER) + ret = -EBUSY; goto out; + } if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); -- cgit v1.2.3 From 0cc5281aa592d0020868f6ccaed359b4ad7b2684 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Jul 2015 16:45:15 +0200 Subject: tick/broadcast: Return busy when IPI is pending Tell the idle code not to go deep if the broadcast IPI is about to arrive. [ Split out from a larger combo patch ] Tested-by: Sudeep Holla Signed-off-by: Thomas Gleixner Cc: Suzuki Poulose Cc: Lorenzo Pieralisi Cc: Catalin Marinas Cc: Rafael J. Wysocki Cc: Peter Zijlstra Cc: Preeti U Murthy Cc: Ingo Molnar Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1507070929360.3916@nanos --- kernel/time/tick-broadcast.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index da7b40fdf6d0..70b47bc928b9 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -725,11 +725,15 @@ int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) * if the cpu local event is earlier than the * broadcast event. If the current CPU is in * the force mask, then we are going to be - * woken by the IPI right away. + * woken by the IPI right away; we return + * busy, so the CPU does not try to go deep + * idle. */ - if (!cpumask_test_cpu(cpu, tick_broadcast_force_mask) && - dev->next_event.tv64 < bc->next_event.tv64) + if (cpumask_test_cpu(cpu, tick_broadcast_force_mask)) { + ret = -EBUSY; + } else if (dev->next_event.tv64 < bc->next_event.tv64) { tick_broadcast_set_event(bc, cpu, dev->next_event); + } } /* * If the current CPU owns the hrtimer broadcast -- cgit v1.2.3 From d5113e13a550bc9c2b53cc9944b8a06453c4a0a1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Jul 2015 16:43:04 +0200 Subject: tick/broadcast: Check for hrtimer broadcast active early If the current cpu is the one which has the hrtimer based broadcast queued then we better return busy immediately instead of going through loops and hoops to figure that out. [ Split out from a larger combo patch ] Tested-by: Sudeep Holla Signed-off-by: Thomas Gleixner Cc: Suzuki Poulose Cc: Lorenzo Pieralisi Cc: Catalin Marinas Cc: Rafael J. Wysocki Cc: Peter Zijlstra Cc: Preeti U Murthy Cc: Ingo Molnar Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1507070929360.3916@nanos --- kernel/time/tick-broadcast.c | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 70b47bc928b9..c8d731ac9563 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -705,6 +705,17 @@ int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) cpu = smp_processor_id(); if (state == TICK_BROADCAST_ENTER) { + /* + * If the current CPU owns the hrtimer broadcast + * mechanism, it cannot go deep idle and we do not add + * the CPU to the broadcast mask. We don't have to go + * through the EXIT path as the local timer is not + * shutdown. + */ + ret = broadcast_needs_cpu(bc, cpu); + if (ret) + goto out; + /* * If the broadcast device is in periodic mode, we * return. @@ -718,7 +729,10 @@ int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); + + /* Conditionally shut down the local timer. */ broadcast_shutdown_local(bc, dev); + /* * We only reprogram the broadcast timer if we * did not mark ourself in the force mask and @@ -733,18 +747,20 @@ int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) ret = -EBUSY; } else if (dev->next_event.tv64 < bc->next_event.tv64) { tick_broadcast_set_event(bc, cpu, dev->next_event); + /* + * In case of hrtimer broadcasts the + * programming might have moved the + * timer to this cpu. If yes, remove + * us from the broadcast mask and + * return busy. + */ + ret = broadcast_needs_cpu(bc, cpu); + if (ret) { + cpumask_clear_cpu(cpu, + tick_broadcast_oneshot_mask); + } } } - /* - * If the current CPU owns the hrtimer broadcast - * mechanism, it cannot go deep idle and we remove the - * CPU from the broadcast mask. We don't have to go - * through the EXIT path as the local timer is not - * shutdown. - */ - ret = broadcast_needs_cpu(bc, cpu); - if (ret) - cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); } else { if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); -- cgit v1.2.3 From c4288334818c81c946acb23d2319881f58c3d497 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 5 Jul 2015 20:53:17 +0000 Subject: tick/broadcast: Handle spurious interrupts gracefully Andriy reported that on a virtual machine the warning about negative expiry time in the clock events programming code triggered: hpet: hpet0 irq 40 for MSI hpet: hpet1 irq 41 for MSI Switching to clocksource hpet WARNING: at kernel/time/clockevents.c:239 [] clockevents_program_event+0xdb/0xf0 [] tick_handle_periodic_broadcast+0x41/0x50 [] timer_interrupt+0x15/0x20 When the second hpet is installed as a per cpu timer the broadcast event is not longer required and stopped, which sets the next_evt of the broadcast device to KTIME_MAX. If after that a spurious interrupt happens on the broadcast device, then the current code blindly handles it and tries to reprogram the broadcast device afterwards, which adds the period to next_evt. KTIME_MAX + period results in a negative expiry value causing the WARN_ON in the clockevents code to trigger. Add a proper check for the state of the broadcast device into the interrupt handler and return if the interrupt is spurious. [ Folded in pointer fix from Sudeep ] Reported-by: Andriy Gapon Signed-off-by: Thomas Gleixner Cc: Sudeep Holla Cc: Peter Zijlstra Cc: Preeti U Murthy Link: http://lkml.kernel.org/r/20150705205221.802094647@linutronix.de --- kernel/time/tick-broadcast.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index c8d731ac9563..ee3cf942d6eb 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -316,6 +316,13 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) bool bc_local; raw_spin_lock(&tick_broadcast_lock); + + /* Handle spurious interrupts gracefully */ + if (clockevent_state_shutdown(tick_broadcast_device.evtdev)) { + raw_spin_unlock(&tick_broadcast_lock); + return; + } + bc_local = tick_do_periodic_broadcast(); if (clockevent_state_oneshot(dev)) { -- cgit v1.2.3 From a899418167264c7bac574b1a0f1b2c26c5b0995a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 5 Jul 2015 17:12:30 +0000 Subject: hotplug: Prevent alloc/free of irq descriptors during cpu up/down When a cpu goes up some architectures (e.g. x86) have to walk the irq space to set up the vector space for the cpu. While this needs extra protection at the architecture level we can avoid a few race conditions by preventing the concurrent allocation/free of irq descriptors and the associated data. When a cpu goes down it moves the interrupts which are targeted to this cpu away by reassigning the affinities. While this happens interrupts can be allocated and freed, which opens a can of race conditions in the code which reassignes the affinities because interrupt descriptors might be freed underneath. Example: CPU1 CPU2 cpu_up/down irq_desc = irq_to_desc(irq); remove_from_radix_tree(desc); raw_spin_lock(&desc->lock); free(desc); We could protect the irq descriptors with RCU, but that would require a full tree change of all accesses to interrupt descriptors. But fortunately these kind of race conditions are rather limited to a few things like cpu hotplug. The normal setup/teardown is very well serialized. So the simpler and obvious solution is: Prevent allocation and freeing of interrupt descriptors accross cpu hotplug. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Cc: xiao jin Cc: Joerg Roedel Cc: Borislav Petkov Cc: Yanmin Zhang Link: http://lkml.kernel.org/r/20150705171102.063519515@linutronix.de --- kernel/cpu.c | 22 +++++++++++++++++++++- kernel/irq/internals.h | 4 ---- 2 files changed, 21 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 9c9c9fab16cc..6a374544d495 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include "smpboot.h" @@ -392,13 +393,19 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) smpboot_park_threads(cpu); /* - * So now all preempt/rcu users must observe !cpu_active(). + * Prevent irq alloc/free while the dying cpu reorganizes the + * interrupt affinities. */ + irq_lock_sparse(); + /* + * So now all preempt/rcu users must observe !cpu_active(). + */ err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { /* CPU didn't die: tell everyone. Can't complain. */ cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); + irq_unlock_sparse(); goto out_release; } BUG_ON(cpu_online(cpu)); @@ -415,6 +422,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */ per_cpu(cpu_dead_idle, cpu) = false; + /* Interrupts are moved away from the dying cpu, reenable alloc/free */ + irq_unlock_sparse(); + hotplug_cpu__broadcast_tick_pull(cpu); /* This actually kills the CPU. */ __cpu_die(cpu); @@ -517,8 +527,18 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) goto out_notify; } + /* + * Some architectures have to walk the irq descriptors to + * setup the vector space for the cpu which comes online. + * Prevent irq alloc/free across the bringup. + */ + irq_lock_sparse(); + /* Arch-specific enabling code. */ ret = __cpu_up(cpu, idle); + + irq_unlock_sparse(); + if (ret != 0) goto out_notify; BUG_ON(!cpu_online(cpu)); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 4834ee828c41..61008b8433ab 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -76,12 +76,8 @@ extern void unmask_threaded_irq(struct irq_desc *desc); #ifdef CONFIG_SPARSE_IRQ static inline void irq_mark_irq(unsigned int irq) { } -extern void irq_lock_sparse(void); -extern void irq_unlock_sparse(void); #else extern void irq_mark_irq(unsigned int irq); -static inline void irq_lock_sparse(void) { } -static inline void irq_unlock_sparse(void) { } #endif extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); -- cgit v1.2.3 From 6224beb12e190ff11f3c7d4bf50cb2922878f600 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 7 Jul 2015 15:05:03 -0400 Subject: tracing: Have branch tracer use recursive field of task struct Fengguang Wu's tests triggered a bug in the branch tracer's start up test when CONFIG_DEBUG_PREEMPT set. This was because that config adds some debug logic in the per cpu field, which calls back into the branch tracer. The branch tracer has its own recursive checks, but uses a per cpu variable to implement it. If retrieving the per cpu variable calls back into the branch tracer, you can see how things will break. Instead of using a per cpu variable, use the trace_recursion field of the current task struct. Simply set a bit when entering the branch tracing and clear it when leaving. If the bit is set on entry, just don't do the tracing. There's also the case with lockdep, as the local_irq_save() called before the recursion can also trigger code that can call back into the function. Changing that to a raw_local_irq_save() will protect that as well. This prevents the recursion and the inevitable crash that follows. Link: http://lkml.kernel.org/r/20150630141803.GA28071@wfg-t540p.sh.intel.com Cc: stable@vger.kernel.org # 3.10+ Reported-by: Fengguang Wu Tested-by: Fengguang Wu Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 1 + kernel/trace/trace_branch.c | 17 ++++++++++------- 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f060716b02ae..74bde81601a9 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -444,6 +444,7 @@ enum { TRACE_CONTROL_BIT, + TRACE_BRANCH_BIT, /* * Abuse of the trace_recursion. * As we need a way to maintain state if we are tracing the function diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index a87b43f49eb4..e2e12ad3186f 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -36,9 +36,12 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) struct trace_branch *entry; struct ring_buffer *buffer; unsigned long flags; - int cpu, pc; + int pc; const char *p; + if (current->trace_recursion & TRACE_BRANCH_BIT) + return; + /* * I would love to save just the ftrace_likely_data pointer, but * this code can also be used by modules. Ugly things can happen @@ -49,10 +52,10 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) if (unlikely(!tr)) return; - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = per_cpu_ptr(tr->trace_buffer.data, cpu); - if (atomic_inc_return(&data->disabled) != 1) + raw_local_irq_save(flags); + current->trace_recursion |= TRACE_BRANCH_BIT; + data = this_cpu_ptr(tr->trace_buffer.data); + if (atomic_read(&data->disabled)) goto out; pc = preempt_count(); @@ -81,8 +84,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) __buffer_unlock_commit(buffer, event); out: - atomic_dec(&data->disabled); - local_irq_restore(flags); + current->trace_recursion &= ~TRACE_BRANCH_BIT; + raw_local_irq_restore(flags); } static inline -- cgit v1.2.3 From 45820c294fe1b1a9df495d57f40585ef2d069a39 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 8 Jul 2015 09:33:38 -0700 Subject: Fix broken audit tests for exec arg len The "fix" in commit 0b08c5e5944 ("audit: Fix check of return value of strnlen_user()") didn't fix anything, it broke things. As reported by Steven Rostedt: "Yes, strnlen_user() returns 0 on fault, but if you look at what len is set to, than you would notice that on fault len would be -1" because we just subtracted one from the return value. So testing against 0 doesn't test for a fault condition, it tests against a perfectly valid empty string. Also fix up the usual braindamage wrt using WARN_ON() inside a conditional - make it part of the conditional and remove the explicit unlikely() (which is already part of the WARN_ON*() logic, exactly so that you don't have to write unreadable code. Reported-and-tested-by: Steven Rostedt Cc: Jan Kara Cc: Paul Moore Signed-off-by: Linus Torvalds --- kernel/auditsc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 09c65640cad6..e85bdfd15fed 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1021,8 +1021,7 @@ static int audit_log_single_execve_arg(struct audit_context *context, * for strings that are too long, we should not have created * any. */ - if (unlikely((len == 0) || len > MAX_ARG_STRLEN - 1)) { - WARN_ON(1); + if (WARN_ON_ONCE(len < 0 || len > MAX_ARG_STRLEN - 1)) { send_sig(SIGKILL, current, 0); return -1; } -- cgit v1.2.3 From 758556bdc1c8a8dffea0ea9f9df891878cc2468c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 9 Jul 2015 06:48:06 +0930 Subject: module: Fix load_module() error path The load_module() error path frees a module but forgot to take it out of the mod_tree, leaving a dangling entry in the tree, causing havoc. Cc: Mathieu Desnoyers Reported-by: Arthur Marsh Tested-by: Arthur Marsh Fixes: 93c2e105f6bc ("module: Optimize __module_address() using a latched RB-tree") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Rusty Russell --- kernel/module.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 3e0e19763d24..4d2b82e610e2 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3557,6 +3557,7 @@ static int load_module(struct load_info *info, const char __user *uargs, mutex_lock(&module_mutex); /* Unlink carefully: kallsyms could be walking list. */ list_del_rcu(&mod->list); + mod_tree_remove(mod); wake_up_all(&module_wq); /* Wait for RCU-sched synchronizing before releasing mod->list. */ synchronize_sched(); -- cgit v1.2.3 From c4d029f2d43b39de7b9299e8b58102a442ba86f8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 11 Jul 2015 14:26:34 +0200 Subject: tick/broadcast: Prevent NULL pointer dereference Dan reported that the recent changes to the broadcast code introduced a potential NULL dereference. Add the proper check. Fixes: e0454311903d "tick/broadcast: Sanity check the shutdown of the local clock_event" Reported-by: Dan Carpenter Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index ee3cf942d6eb..52b9e199b5ac 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -409,14 +409,16 @@ void tick_broadcast_control(enum tick_broadcast_mode mode) break; } - if (cpumask_empty(tick_broadcast_mask)) { - if (!bc_stopped) - clockevents_shutdown(bc); - } else if (bc_stopped) { - if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) - tick_broadcast_start_periodic(bc); - else - tick_broadcast_setup_oneshot(bc); + if (bc) { + if (cpumask_empty(tick_broadcast_mask)) { + if (!bc_stopped) + clockevents_shutdown(bc); + } else if (bc_stopped) { + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) + tick_broadcast_start_periodic(bc); + else + tick_broadcast_setup_oneshot(bc); + } } raw_spin_unlock(&tick_broadcast_lock); } -- cgit v1.2.3 From 0f44705175347ec96935d60b765b5d14ecc763bb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Jul 2015 12:01:04 +0200 Subject: tick: Move the export of tick_broadcast_oneshot_control to the proper place tick_broadcast_oneshot_control got moved from tick-broadcast to tick-common, but the export stayed in the old place. Fix it up. Fixes: f32dd1170511 'tick/broadcast: Make idle check independent from mode and config' Reported-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 1 - kernel/time/tick-common.c | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 52b9e199b5ac..f6aae7977824 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -839,7 +839,6 @@ out: raw_spin_unlock(&tick_broadcast_lock); return ret; } -EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control); /* * Reset the one shot broadcast for a cpu diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 55e13efff1ab..f8bf47571dda 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -363,6 +363,7 @@ int tick_broadcast_oneshot_control(enum tick_broadcast_state state) return __tick_broadcast_oneshot_control(state); } +EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control); #ifdef CONFIG_HOTPLUG_CPU /* -- cgit v1.2.3 From ce0d3c0a6fb1422101498ef378c0851dabbbf67f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 14 Jul 2015 22:03:57 +0200 Subject: genirq: Revert sparse irq locking around __cpu_up() and move it to x86 for now Boris reported that the sparse_irq protection around __cpu_up() in the generic code causes a regression on Xen. Xen allocates interrupts and some more in the xen_cpu_up() function, so it deadlocks on the sparse_irq_lock. There is no simple fix for this and we really should have the protection for all architectures, but for now the only solution is to move it to x86 where actual wreckage due to the lack of protection has been observed. Reported-and-tested-by: Boris Ostrovsky Fixes: a89941816726 'hotplug: Prevent alloc/free of irq descriptors during cpu up/down' Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: xiao jin Cc: Joerg Roedel Cc: Borislav Petkov Cc: Yanmin Zhang Cc: xen-devel --- kernel/cpu.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 6a374544d495..5644ec5582b9 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -527,18 +527,9 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) goto out_notify; } - /* - * Some architectures have to walk the irq descriptors to - * setup the vector space for the cpu which comes online. - * Prevent irq alloc/free across the bringup. - */ - irq_lock_sparse(); - /* Arch-specific enabling code. */ ret = __cpu_up(cpu, idle); - irq_unlock_sparse(); - if (ret != 0) goto out_notify; BUG_ON(!cpu_online(cpu)); -- cgit v1.2.3 From 75a06189fc508a2acf470b0b12710362ffb2c4b1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 16 Jul 2015 14:10:17 +0200 Subject: genirq: Prevent resend to interrupts marked IRQ_NESTED_THREAD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The resend mechanism happily calls the interrupt handler of interrupts which are marked IRQ_NESTED_THREAD from softirq context. This can result in crashes because the interrupt handler is not the proper way to invoke the device handlers. They must be invoked via handle_nested_irq. Prevent the resend even if the interrupt has no valid parent irq set. Its better to have a lost interrupt than a crashing machine. Reported-by: Uwe Kleine-König Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org --- kernel/irq/resend.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 9065107f083e..7a5237a1bce5 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -75,13 +75,21 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { #ifdef CONFIG_HARDIRQS_SW_RESEND /* - * If the interrupt has a parent irq and runs - * in the thread context of the parent irq, - * retrigger the parent. + * If the interrupt is running in the thread + * context of the parent irq we need to be + * careful, because we cannot trigger it + * directly. */ - if (desc->parent_irq && - irq_settings_is_nested_thread(desc)) + if (irq_settings_is_nested_thread(desc)) { + /* + * If the parent_irq is valid, we + * retrigger the parent, otherwise we + * do nothing. + */ + if (!desc->parent_irq) + return; irq = desc->parent_irq; + } /* Set it pending and activate the softirq: */ set_bit(irq, irqs_resend); tasklet_schedule(&resend_tasklet); -- cgit v1.2.3 From 0c8c0f03e3a292e031596484275c14cf39c0ab7a Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 17 Jul 2015 12:28:11 +0200 Subject: x86/fpu, sched: Dynamically allocate 'struct fpu' The FPU rewrite removed the dynamic allocations of 'struct fpu'. But, this potentially wastes massive amounts of memory (2k per task on systems that do not have AVX-512 for instance). Instead of having a separate slab, this patch just appends the space that we need to the 'task_struct' which we dynamically allocate already. This saves from doing an extra slab allocation at fork(). The only real downside here is that we have to stick everything and the end of the task_struct. But, I think the BUILD_BUG_ON()s I stuck in there should keep that from being too fragile. Signed-off-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1437128892-9831-2-git-send-email-mingo@kernel.org Signed-off-by: Ingo Molnar --- kernel/fork.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 1bfefc6f96a4..431b67a6098c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -287,15 +287,21 @@ static void set_max_threads(unsigned int max_threads_suggested) max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS); } +int __weak arch_task_struct_size(void) +{ + return sizeof(struct task_struct); +} + void __init fork_init(void) { + int task_struct_size = arch_task_struct_size(); #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR #ifndef ARCH_MIN_TASKALIGN #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES #endif /* create a slab on which task_structs can be allocated */ task_struct_cachep = - kmem_cache_create("task_struct", sizeof(struct task_struct), + kmem_cache_create("task_struct", task_struct_size, ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL); #endif -- cgit v1.2.3 From 5aaeb5c01c5b6c0be7b7aadbf3ace9f3a4458c3d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 17 Jul 2015 12:28:12 +0200 Subject: x86/fpu, sched: Introduce CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT and use it on x86 Don't burden architectures without dynamic task_struct sizing with the overhead of dynamic sizing. Also optimize the x86 code a bit by caching task_struct_size. Acked-and-Tested-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1437128892-9831-3-git-send-email-mingo@kernel.org Signed-off-by: Ingo Molnar --- kernel/fork.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 431b67a6098c..dbd9b8d7b7cc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -287,21 +287,20 @@ static void set_max_threads(unsigned int max_threads_suggested) max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS); } -int __weak arch_task_struct_size(void) -{ - return sizeof(struct task_struct); -} +#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT +/* Initialized by the architecture: */ +int arch_task_struct_size __read_mostly; +#endif void __init fork_init(void) { - int task_struct_size = arch_task_struct_size(); #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR #ifndef ARCH_MIN_TASKALIGN #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES #endif /* create a slab on which task_structs can be allocated */ task_struct_cachep = - kmem_cache_create("task_struct", task_struct_size, + kmem_cache_create("task_struct", arch_task_struct_size, ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL); #endif -- cgit v1.2.3