Diffstat (limited to 'kernel/watchdog.c')
-rw-r--r--   kernel/watchdog.c   143
1 file changed, 137 insertions(+), 6 deletions(-)
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 198137b1cadc..9472691c1eb0 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -20,6 +20,7 @@
 #include <linux/smpboot.h>
 #include <linux/sched/rt.h>
 #include <linux/tick.h>
+#include <linux/workqueue.h>
 
 #include <asm/irq_regs.h>
 #include <linux/kvm_para.h>
@@ -103,6 +104,11 @@ static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
 static DEFINE_PER_CPU(bool, hard_watchdog_warn);
 static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
 static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
+#endif
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
+static cpumask_t __read_mostly watchdog_cpus;
+#endif
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI
 static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
 #endif
 static unsigned long soft_lockup_nmi_warn;
@@ -114,7 +120,9 @@ static unsigned long soft_lockup_nmi_warn;
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
 unsigned int __read_mostly hardlockup_panic =
 			CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI
 static unsigned long hardlockup_allcpu_dumped;
+#endif
 /*
  * We may not want to enable hard lockup detection by default in all cases,
  * for example when running the kernel as a guest on a hypervisor. In these
@@ -225,7 +233,15 @@ static void __touch_watchdog(void)
 	__this_cpu_write(watchdog_touch_ts, get_timestamp());
 }
 
-void touch_softlockup_watchdog(void)
+/**
+ * touch_softlockup_watchdog_sched - touch watchdog on scheduler stalls
+ *
+ * Call when the scheduler may have stalled for legitimate reasons
+ * preventing the watchdog task from executing - e.g. the scheduler
+ * entering idle state.  This should only be used for scheduler events.
+ * Use touch_softlockup_watchdog() for everything else.
+ */
+void touch_softlockup_watchdog_sched(void)
 {
 	/*
 	 * Preemption can be enabled.  It doesn't matter which CPU's timestamp
@@ -233,6 +249,12 @@ void touch_softlockup_watchdog(void)
 	 */
 	raw_cpu_write(watchdog_touch_ts, 0);
 }
+
+void touch_softlockup_watchdog(void)
+{
+	touch_softlockup_watchdog_sched();
+	wq_watchdog_touch(raw_smp_processor_id());
+}
 EXPORT_SYMBOL(touch_softlockup_watchdog);
 
 void touch_all_softlockup_watchdogs(void)
@@ -246,6 +268,7 @@ void touch_all_softlockup_watchdogs(void)
 	 */
 	for_each_watchdog_cpu(cpu)
 		per_cpu(watchdog_touch_ts, cpu) = 0;
+	wq_watchdog_touch(-1);
 }
 
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
@@ -271,7 +294,7 @@ void touch_softlockup_watchdog_sync(void)
 	__this_cpu_write(watchdog_touch_ts, 0);
 }
 
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI
 /* watchdog detector functions */
 static bool is_hardlockup(void)
 {
@@ -285,6 +308,76 @@ static bool is_hardlockup(void)
 }
 #endif
 
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
+static unsigned int watchdog_next_cpu(unsigned int cpu)
+{
+	cpumask_t cpus = watchdog_cpus;
+	unsigned int next_cpu;
+
+	next_cpu = cpumask_next(cpu, &cpus);
+	if (next_cpu >= nr_cpu_ids)
+		next_cpu = cpumask_first(&cpus);
+
+	if (next_cpu == cpu)
+		return nr_cpu_ids;
+
+	return next_cpu;
+}
+
+static int is_hardlockup_other_cpu(unsigned int cpu)
+{
+	unsigned long hrint = per_cpu(hrtimer_interrupts, cpu);
+
+	if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint)
+		return 1;
+
+	per_cpu(hrtimer_interrupts_saved, cpu) = hrint;
+	return 0;
+}
+
+static void watchdog_check_hardlockup_other_cpu(void)
+{
+	unsigned int next_cpu;
+
+	/*
+	 * Test for hardlockups every 3 samples.  The sample period is
+	 * watchdog_thresh * 2 / 5, so 3 samples gets us back to slightly over
+	 * watchdog_thresh (over by 20%).
+	 */
+	if (__this_cpu_read(hrtimer_interrupts) % 3 != 0)
+		return;
+
+	/* check for a hardlockup on the next cpu */
+	next_cpu = watchdog_next_cpu(smp_processor_id());
+	if (next_cpu >= nr_cpu_ids)
+		return;
+
+	smp_rmb();
+
+	if (per_cpu(watchdog_nmi_touch, next_cpu) == true) {
+		per_cpu(watchdog_nmi_touch, next_cpu) = false;
+		return;
+	}
+
+	if (is_hardlockup_other_cpu(next_cpu)) {
+		/* only warn once */
+		if (per_cpu(hard_watchdog_warn, next_cpu) == true)
+			return;
+
+		if (hardlockup_panic)
+			panic("Watchdog detected hard LOCKUP on cpu %u", next_cpu);
+		else
+			WARN(1, "Watchdog detected hard LOCKUP on cpu %u", next_cpu);
+
+		per_cpu(hard_watchdog_warn, next_cpu) = true;
+	} else {
+		per_cpu(hard_watchdog_warn, next_cpu) = false;
+	}
+}
+#else
+static inline void watchdog_check_hardlockup_other_cpu(void) { return; }
+#endif
+
 static int is_softlockup(unsigned long touch_ts)
 {
 	unsigned long now = get_timestamp();
@@ -297,7 +390,7 @@ static int is_softlockup(unsigned long touch_ts)
 	return 0;
 }
 
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI
 
 static struct perf_event_attr wd_hw_attr = {
 	.type		= PERF_TYPE_HARDWARE,
@@ -360,7 +453,7 @@ static void watchdog_overflow_callback(struct perf_event *event,
 	__this_cpu_write(hard_watchdog_warn, false);
 	return;
 }
-#endif /* CONFIG_HARDLOCKUP_DETECTOR */
+#endif /* CONFIG_HARDLOCKUP_DETECTOR_NMI */
 
 static void watchdog_interrupt_count(void)
 {
@@ -384,6 +477,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 	/* kick the hardlockup detector */
 	watchdog_interrupt_count();
 
+	/* test for hardlockups on the next cpu */
+	watchdog_check_hardlockup_other_cpu();
+
 	/* kick the softlockup detector */
 	wake_up_process(__this_cpu_read(softlockup_watchdog));
 
@@ -561,7 +657,7 @@ static void watchdog(unsigned int cpu)
 		watchdog_nmi_disable(cpu);
 }
 
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI
 /*
  * People like the simple clean cpu node info on boot.
  * Reduce the watchdog noise by only printing messages
@@ -660,9 +756,44 @@ static void watchdog_nmi_disable(unsigned int cpu)
 }
 
 #else
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
+static int watchdog_nmi_enable(unsigned int cpu)
+{
+	/*
+	 * The new cpu will be marked online before the first hrtimer interrupt
+	 * runs on it.  If another cpu tests for a hardlockup on the new cpu
+	 * before it has run its first hrtimer, it will get a false positive.
+	 * Touch the watchdog on the new cpu to delay the first check for at
+	 * least 3 sampling periods to guarantee one hrtimer has run on the new
+	 * cpu.
+	 */
+	per_cpu(watchdog_nmi_touch, cpu) = true;
+	smp_wmb();
+	cpumask_set_cpu(cpu, &watchdog_cpus);
+	return 0;
+}
+
+static void watchdog_nmi_disable(unsigned int cpu)
+{
+	unsigned int next_cpu = watchdog_next_cpu(cpu);
+
+	/*
+	 * Offlining this cpu will cause the cpu before this one to start
+	 * checking the one after this one.  If this cpu just finished checking
+	 * the next cpu and updating hrtimer_interrupts_saved, and then the
+	 * previous cpu checks it within one sample period, it will trigger a
+	 * false positive.  Touch the watchdog on the next cpu to prevent it.
+	 */
+	if (next_cpu < nr_cpu_ids)
+		per_cpu(watchdog_nmi_touch, next_cpu) = true;
+	smp_wmb();
+	cpumask_clear_cpu(cpu, &watchdog_cpus);
+}
+#else
 static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
 static void watchdog_nmi_disable(unsigned int cpu) { return; }
-#endif /* CONFIG_HARDLOCKUP_DETECTOR */
+#endif /* CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU */
+#endif /* CONFIG_HARDLOCKUP_DETECTOR_NMI */
 
 static struct smp_hotplug_thread watchdog_threads = {
 	.store			= &softlockup_watchdog,
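The buddy-CPU scheme added above has each CPU use its regular hrtimer tick to watch the next online CPU in a ring: if that CPU's hrtimer_interrupts counter has not advanced since the previous check, the buddy is presumed hard-locked. Below is a minimal user-space sketch of that detection loop, for illustration only. The names watchdog_next_cpu, is_hardlockup_other_cpu, hrtimer_interrupts, and hrtimer_interrupts_saved mirror the patch; NCPUS, the online[] array, and the main() driver are invented for the example, and the per-CPU state is modeled as plain arrays rather than per-CPU variables.

#include <stdio.h>
#include <stdbool.h>

#define NCPUS 4

static unsigned long hrtimer_interrupts[NCPUS];       /* per-cpu heartbeat */
static unsigned long hrtimer_interrupts_saved[NCPUS]; /* value at last check */
static bool online[NCPUS] = { true, true, true, true };

/* Next online cpu after @cpu, wrapping around; returns NCPUS if alone. */
static unsigned int watchdog_next_cpu(unsigned int cpu)
{
	unsigned int next = cpu;

	do {
		next = (next + 1) % NCPUS;
		if (next == cpu)
			return NCPUS;
	} while (!online[next]);

	return next;
}

/* Locked up if the heartbeat did not advance since the last check. */
static int is_hardlockup_other_cpu(unsigned int cpu)
{
	unsigned long hrint = hrtimer_interrupts[cpu];

	if (hrtimer_interrupts_saved[cpu] == hrint)
		return 1;

	hrtimer_interrupts_saved[cpu] = hrint;
	return 0;
}

int main(void)
{
	unsigned int cpu;

	/* sample period 1: every cpu ticks once; a first check primes *_saved */
	for (cpu = 0; cpu < NCPUS; cpu++)
		hrtimer_interrupts[cpu]++;
	for (cpu = 0; cpu < NCPUS; cpu++)
		is_hardlockup_other_cpu(cpu);

	/* sample period 2: cpu 2 stops ticking */
	for (cpu = 0; cpu < NCPUS; cpu++)
		if (cpu != 2)
			hrtimer_interrupts[cpu]++;

	/* each cpu checks its buddy; cpu 1 catches the stall on cpu 2 */
	for (cpu = 0; cpu < NCPUS; cpu++) {
		unsigned int buddy = watchdog_next_cpu(cpu);

		if (buddy < NCPUS && is_hardlockup_other_cpu(buddy))
			printf("cpu %u: buddy cpu %u appears locked up\n",
			       cpu, buddy);
	}
	return 0;
}

The priming pass mirrors why the patch touches watchdog_nmi_touch for a newly onlined CPU: a buddy whose counter has never been sampled would otherwise look stalled on its very first check.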

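The "every 3 samples" comment in watchdog_check_hardlockup_other_cpu() can be checked numerically. The sample period quoted there is watchdog_thresh * 2 / 5; plugging in the kernel's default watchdog_thresh of 10 seconds (the default is not shown in this diff, so it is assumed here) reproduces the "over by 20%" figure:

#include <stdio.h>

int main(void)
{
	unsigned int watchdog_thresh = 10;      /* seconds, kernel default */
	double sample_period = watchdog_thresh * 2.0 / 5.0; /* 4s per hrtimer sample */
	double check_interval = 3 * sample_period;          /* buddy checked every 3rd sample */

	/* 3 * 4s = 12s against a 10s threshold: 20% over watchdog_thresh */
	printf("sample period : %.1fs\n", sample_period);
	printf("check interval: %.1fs (%.0f%% over watchdog_thresh)\n",
	       check_interval,
	       (check_interval / watchdog_thresh - 1) * 100);
	return 0;
}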