Diffstat (limited to 'kernel')
-rw-r--r--  kernel/events/core.c          |  15
-rw-r--r--  kernel/futex.c                |  98
-rw-r--r--  kernel/pid.c                  |   4
-rw-r--r--  kernel/resource.c             |   3
-rw-r--r--  kernel/sched/core.c           |  38
-rw-r--r--  kernel/sched/fair.c           |   3
-rw-r--r--  kernel/sysctl.c               |  16
-rw-r--r--  kernel/time/hrtimer.c         |   1
-rw-r--r--  kernel/trace/trace_irqsoff.c  |  42
9 files changed, 204 insertions, 16 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 322f63370038..656d55d30f8d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5387,9 +5387,6 @@ static void perf_output_read_one(struct perf_output_handle *handle,
__output_copy(handle, values, n * sizeof(u64));
}
-/*
- * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
- */
static void perf_output_read_group(struct perf_output_handle *handle,
struct perf_event *event,
u64 enabled, u64 running)
@@ -5434,6 +5431,13 @@ static void perf_output_read_group(struct perf_output_handle *handle,
#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
PERF_FORMAT_TOTAL_TIME_RUNNING)
+/*
+ * XXX PERF_SAMPLE_READ vs inherited events seems difficult.
+ *
+ * The problem is that it's both hard and excessively expensive to iterate the
+ * child list, not to mention that it's impossible to IPI the children running
+ * on another CPU, from interrupt/NMI context.
+ */
static void perf_output_read(struct perf_output_handle *handle,
struct perf_event *event)
{
@@ -8167,9 +8171,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
local64_set(&hwc->period_left, hwc->sample_period);
/*
- * we currently do not support PERF_FORMAT_GROUP on inherited events
+ * We currently do not support PERF_SAMPLE_READ on inherited events.
+ * See perf_output_read().
*/
- if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
+ if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
goto err_ns;
if (!has_branch_stack(event))
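
The net effect of the two hunks above is that perf_event_open() now refuses PERF_SAMPLE_READ on inherited events, rather than only refusing PERF_FORMAT_GROUP. A minimal userspace sketch of the rejected combination follows; it is not part of the patch, and the event type, config and sample period are arbitrary:

/* Illustrative only (not part of the patch): the attr combination that
 * perf_event_alloc() now rejects -- an inherited event that also requests
 * PERF_SAMPLE_READ. Event type, config and period are arbitrary. */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_READ;
	attr.inherit = 1;	/* propagate to child tasks */

	/* With the change above, this combination is refused outright. */
	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0)
		perror("perf_event_open");
	else
		close(fd);
	return 0;
}
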
diff --git a/kernel/futex.c b/kernel/futex.c
index a09c1dd1f659..760a97da1050 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -470,6 +470,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
struct page *page, *page_head;
+ struct address_space *mapping;
int err, ro = 0;
/*
@@ -555,7 +556,19 @@ again:
}
#endif
- lock_page(page_head);
+ /*
+ * The treatment of mapping from this point on is critical. The page
+ * lock protects many things but in this context the page lock
+ * stabilizes mapping, prevents inode freeing in the shared
+ * file-backed region case and guards against movement to swap cache.
+ *
+ * Strictly speaking the page lock is not needed in all cases being
+ * considered here and the page lock forces unnecessary serialization.
+ * From this point on, mapping will be re-verified if necessary and
+ * the page lock will be acquired only if it is unavoidable.
+ */
+
+ mapping = READ_ONCE(page_head->mapping);
/*
* If page_head->mapping is NULL, then it cannot be a PageAnon
@@ -572,18 +585,31 @@ again:
* shmem_writepage move it from filecache to swapcache beneath us:
* an unlikely race, but we do need to retry for page_head->mapping.
*/
- if (!page_head->mapping) {
- int shmem_swizzled = PageSwapCache(page_head);
+ if (unlikely(!mapping)) {
+ int shmem_swizzled;
+
+ /*
+ * Page lock is required to identify which special case above
+ * applies. If this is really a shmem page then the page lock
+ * will prevent unexpected transitions.
+ */
+ lock_page(page_head);
+ shmem_swizzled = PageSwapCache(page_head) || page_head->mapping;
unlock_page(page_head);
put_page(page_head);
+
if (shmem_swizzled)
goto again;
+
return -EFAULT;
}
/*
* Private mappings are handled in a simple way.
*
+ * If the futex key is stored on an anonymous page, then the associated
+ * object is the mm which is implicitly pinned by the calling process.
+ *
* NOTE: When userspace waits on a MAP_SHARED mapping, even if
* it's a read-only handle, it's expected that futexes attach to
* the object not the particular process.
@@ -601,16 +627,74 @@ again:
key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
key->private.mm = mm;
key->private.address = address;
+
+ get_futex_key_refs(key); /* implies smp_mb(); (B) */
+
} else {
+ struct inode *inode;
+
+ /*
+ * The associated futex object in this case is the inode and
+ * the page->mapping must be traversed. Ordinarily this should
+ * be stabilised under page lock but it's not strictly
+ * necessary in this case as we just want to pin the inode, not
+ * update the radix tree or anything like that.
+ *
+ * The RCU read lock is taken as the inode is finally freed
+ * under RCU. If the mapping still matches expectations then the
+ * mapping->host can be safely accessed as being a valid inode.
+ */
+ rcu_read_lock();
+
+ if (READ_ONCE(page_head->mapping) != mapping) {
+ rcu_read_unlock();
+ put_page(page_head);
+
+ goto again;
+ }
+
+ inode = READ_ONCE(mapping->host);
+ if (!inode) {
+ rcu_read_unlock();
+ put_page(page_head);
+
+ goto again;
+ }
+
+ /*
+ * Take a reference unless it is about to be freed. Previously
+ * this reference was taken by ihold under the page lock,
+ * pinning the inode in place, so i_lock was unnecessary. The
+ * only way for this check to fail is if the inode was
+ * truncated in parallel, so warn for now if this happens.
+ *
+ * We are not calling into get_futex_key_refs() in file-backed
+ * cases, therefore a successful atomic_inc return below will
+ * guarantee that get_futex_key() will still imply smp_mb(); (B).
+ */
+ if (WARN_ON_ONCE(!atomic_inc_not_zero(&inode->i_count))) {
+ rcu_read_unlock();
+ put_page(page_head);
+
+ goto again;
+ }
+
+ /* Should be impossible but let's be paranoid for now */
+ if (WARN_ON_ONCE(inode->i_mapping != mapping)) {
+ err = -EFAULT;
+ rcu_read_unlock();
+ iput(inode);
+
+ goto out;
+ }
+
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
- key->shared.inode = page_head->mapping->host;
+ key->shared.inode = inode;
key->shared.pgoff = basepage_index(page);
+ rcu_read_unlock();
}
- get_futex_key_refs(key); /* implies MB (B) */
-
out:
- unlock_page(page_head);
put_page(page_head);
return err;
}
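
For reference, the shared file-backed case served by the inode-keyed path above corresponds to userspace along these lines. This is a minimal sketch, not part of the patch: the file path and the wait/wake values are arbitrary and error handling is trimmed.

/* Minimal sketch of a shared, file-backed futex -- the case get_futex_key()
 * above keys with FUT_OFF_INODE. */
#include <linux/futex.h>
#include <sys/syscall.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

static long futex(uint32_t *uaddr, int op, uint32_t val)
{
	return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

int main(void)
{
	int fd = open("/tmp/futex-demo", O_RDWR | O_CREAT, 0600);
	uint32_t *f;

	ftruncate(fd, sizeof(*f));
	/* MAP_SHARED of a regular file: the futex key is derived from the
	 * backing inode, not from current->mm. */
	f = mmap(NULL, sizeof(*f), PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	*f = 0;

	if (fork() == 0) {
		futex(f, FUTEX_WAIT, 0);	/* sleep while *f == 0 */
		printf("child: woken, *f = %u\n", *f);
		_exit(0);
	}

	sleep(1);
	*f = 1;
	futex(f, FUTEX_WAKE, 1);		/* wake one waiter */
	wait(NULL);
	return 0;
}
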
diff --git a/kernel/pid.c b/kernel/pid.c
index b17263be9082..5fe7cdb6d05f 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -322,8 +322,10 @@ struct pid *alloc_pid(struct pid_namespace *ns)
}
if (unlikely(is_child_reaper(pid))) {
- if (pid_ns_prepare_proc(ns))
+ if (pid_ns_prepare_proc(ns)) {
+ disable_pid_allocation(ns);
goto out_free;
+ }
}
get_pid_ns(ns);
diff --git a/kernel/resource.c b/kernel/resource.c
index c09d484f7b5f..73348f574163 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -611,7 +611,8 @@ static int __find_resource(struct resource *root, struct resource *old,
alloc.start = constraint->alignf(constraint->alignf_data, &avail,
size, constraint->align);
alloc.end = alloc.start + size - 1;
- if (resource_contains(&avail, &alloc)) {
+ if (alloc.start <= alloc.end &&
+ resource_contains(&avail, &alloc)) {
new->start = alloc.start;
new->end = alloc.end;
return 0;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fffc50b0191f..c1ecb07de762 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3268,9 +3268,24 @@ notrace unsigned long get_parent_ip(unsigned long addr)
#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
defined(CONFIG_PREEMPT_TRACER))
+/*
+ * preemptoff stack tracing threshold in ns.
+ * default: 1ms
+ */
+unsigned int sysctl_preemptoff_tracing_threshold_ns = 1000000UL;
+
+struct preempt_store {
+ u64 ts;
+ unsigned long caddr[4];
+ bool irqs_disabled;
+};
+
+static DEFINE_PER_CPU(struct preempt_store, the_ps);
void preempt_count_add(int val)
{
+ struct preempt_store *ps = &per_cpu(the_ps, raw_smp_processor_id());
+
#ifdef CONFIG_DEBUG_PREEMPT
/*
* Underflow?
@@ -3291,6 +3306,13 @@ void preempt_count_add(int val)
#ifdef CONFIG_DEBUG_PREEMPT
current->preempt_disable_ip = ip;
#endif
+ ps->ts = sched_clock();
+ ps->caddr[0] = CALLER_ADDR0;
+ ps->caddr[1] = CALLER_ADDR1;
+ ps->caddr[2] = CALLER_ADDR2;
+ ps->caddr[3] = CALLER_ADDR3;
+ ps->irqs_disabled = irqs_disabled();
+
trace_preempt_off(CALLER_ADDR0, ip);
}
}
@@ -3313,8 +3335,22 @@ void preempt_count_sub(int val)
return;
#endif
- if (preempt_count() == val)
+ if (preempt_count() == val) {
+ struct preempt_store *ps = &per_cpu(the_ps,
+ raw_smp_processor_id());
+ u64 delta = sched_clock() - ps->ts;
+
+ /*
+ * Trace preempt disable stack if preemption
+ * is disabled for more than the threshold.
+ */
+ if (delta > sysctl_preemptoff_tracing_threshold_ns)
+ trace_sched_preempt_disable(delta, ps->irqs_disabled,
+ ps->caddr[0], ps->caddr[1],
+ ps->caddr[2], ps->caddr[3]);
+
trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+ }
__preempt_count_sub(val);
}
EXPORT_SYMBOL(preempt_count_sub);
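
The trace_sched_preempt_disable() tracepoint consumed above is not defined in this diff. A hypothetical TRACE_EVENT declaration matching the call's six arguments might look roughly like the following in a trace events header; every field name and the format string here are illustrative assumptions, not the tree's actual definition.

/* Hypothetical sketch only -- not part of this diff. */
TRACE_EVENT(sched_preempt_disable,

	TP_PROTO(u64 delta, bool irqs_disabled,
		 unsigned long caddr0, unsigned long caddr1,
		 unsigned long caddr2, unsigned long caddr3),

	TP_ARGS(delta, irqs_disabled, caddr0, caddr1, caddr2, caddr3),

	TP_STRUCT__entry(
		__field(u64,		delta)
		__field(bool,		irqs_disabled)
		__field(unsigned long,	caddr0)
		__field(unsigned long,	caddr1)
		__field(unsigned long,	caddr2)
		__field(unsigned long,	caddr3)
	),

	TP_fast_assign(
		__entry->delta		= delta;
		__entry->irqs_disabled	= irqs_disabled;
		__entry->caddr0		= caddr0;
		__entry->caddr1		= caddr1;
		__entry->caddr2		= caddr2;
		__entry->caddr3		= caddr3;
	),

	TP_printk("delta=%llu ns irqs_disabled=%d callers=%ps<-%ps<-%ps<-%ps",
		  (unsigned long long)__entry->delta, __entry->irqs_disabled,
		  (void *)__entry->caddr0, (void *)__entry->caddr1,
		  (void *)__entry->caddr2, (void *)__entry->caddr3)
);
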
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 23e37b0674df..f962ab1eb046 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2402,7 +2402,8 @@ void task_numa_work(struct callback_head *work)
return;
- down_read(&mm->mmap_sem);
+ if (!down_read_trylock(&mm->mmap_sem))
+ return;
vma = find_vma(mm, start);
if (!vma) {
reset_ptenuma_scan(p);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index bc4ca30ddc21..14f19af9d79a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -291,6 +291,22 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#if defined(CONFIG_PREEMPT_TRACER) || defined(CONFIG_IRQSOFF_TRACER)
+ {
+ .procname = "preemptoff_tracing_threshold_ns",
+ .data = &sysctl_preemptoff_tracing_threshold_ns,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "irqsoff_tracing_threshold_ns",
+ .data = &sysctl_irqsoff_tracing_threshold_ns,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
#ifdef CONFIG_SCHED_HMP
{
.procname = "sched_freq_reporting_policy",
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index beafdf94b3b5..79fadcad21ff 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -435,6 +435,7 @@ void destroy_hrtimer_on_stack(struct hrtimer *timer)
{
debug_object_free(timer, &hrtimer_debug_descr);
}
+EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack);
#else
static inline void debug_hrtimer_init(struct hrtimer *timer) { }
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 21b162c07e83..c00137ea939e 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -13,6 +13,7 @@
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/ftrace.h>
+#include <linux/sched/sysctl.h>
#include "trace.h"
@@ -39,6 +40,12 @@ static int save_flags;
static void stop_irqsoff_tracer(struct trace_array *tr, int graph);
static int start_irqsoff_tracer(struct trace_array *tr, int graph);
+/*
+ * irqsoff stack tracing threshold in ns.
+ * default: 1ms
+ */
+unsigned int sysctl_irqsoff_tracing_threshold_ns = 1000000UL;
+
#ifdef CONFIG_PREEMPT_TRACER
static inline int
preempt_trace(void)
@@ -454,17 +461,52 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
#else /* !CONFIG_PROVE_LOCKING */
+#ifdef CONFIG_PREEMPTIRQ_EVENTS
+struct irqsoff_store {
+ u64 ts;
+ unsigned long caddr[4];
+};
+
+static DEFINE_PER_CPU(struct irqsoff_store, the_irqsoff);
+#endif /* CONFIG_PREEMPTIRQ_EVENTS */
+
/*
* We are only interested in hardirq on/off events:
*/
static inline void tracer_hardirqs_on(void)
{
+#ifdef CONFIG_PREEMPTIRQ_EVENTS
+ struct irqsoff_store *is = &per_cpu(the_irqsoff,
+ raw_smp_processor_id());
+ u64 delta = 0;
+
+ if (is->ts) {
+ delta = sched_clock() - is->ts;
+ is->ts = 0;
+ }
+ if (delta > sysctl_irqsoff_tracing_threshold_ns)
+ trace_irqs_disable(delta, is->caddr[0], is->caddr[1],
+ is->caddr[2], is->caddr[3]);
+#endif /* CONFIG_PREEMPTIRQ_EVENTS */
if (!preempt_trace() && irq_trace())
stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
}
static inline void tracer_hardirqs_off(void)
{
+#ifdef CONFIG_PREEMPTIRQ_EVENTS
+ struct irqsoff_store *is = &per_cpu(the_irqsoff,
+ raw_smp_processor_id());
+
+ if (!is->ts) {
+ is->ts = sched_clock();
+ is->caddr[0] = CALLER_ADDR0;
+ is->caddr[1] = CALLER_ADDR1;
+ is->caddr[2] = CALLER_ADDR2;
+ is->caddr[3] = CALLER_ADDR3;
+ }
+#endif /* CONFIG_PREEMPTIRQ_EVENTS */
+
if (!preempt_trace() && irq_trace())
start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
}
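
As a plain userspace analogue of the bookkeeping above: stamp the start of the critical section, compute the delta at its end, and report only when the delta exceeds a threshold. The sketch below is illustrative only; the busy loop stands in for an IRQs-off region and the 1 ms threshold mirrors the default.

/* Userspace analogue (illustrative only) of the irqsoff threshold pattern. */
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define THRESHOLD_NS 1000000ULL

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

int main(void)
{
	uint64_t start = now_ns();	/* analogue of tracer_hardirqs_off() */
	volatile unsigned long spin;
	uint64_t delta;

	for (spin = 0; spin < 50000000UL; spin++)
		;			/* the "critical section" */

	delta = now_ns() - start;	/* analogue of tracer_hardirqs_on() */
	if (delta > THRESHOLD_NS)
		printf("section lasted %llu ns (over threshold)\n",
		       (unsigned long long)delta);
	return 0;
}
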