From dead9f29ddcc69551f35529a252d2704047870d3 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 15 May 2015 12:15:21 -0700 Subject: perf: Fix race in BPF program unregister there is a race between perf_event_free_bpf_prog() and free_trace_kprobe(): __free_event() event->destroy(event) tp_perf_event_destroy() perf_trace_destroy() perf_trace_event_unreg() which is dropping event->tp_event->perf_refcount and allows to proceed in: unregister_trace_kprobe() unregister_kprobe_event() trace_remove_event_call() probe_remove_event_call() free_trace_kprobe() while __free_event does: call_rcu(&event->rcu_head, free_event_rcu); free_event_rcu() perf_event_free_bpf_prog() To fix the race simply move perf_event_free_bpf_prog() before event->destroy(), since event->tp_event is still valid at that point. Note, perf_trace_destroy() is not racing with trace_remove_event_call() since they both grab event_mutex. Reported-by: Wang Nan Signed-off-by: Alexei Starovoitov Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: lizefan@huawei.com Cc: pi3orama@163.com Fixes: 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes") Link: http://lkml.kernel.org/r/1431717321-28772-1-git-send-email-ast@plumgrid.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 1a3bf48743ce..eddf1ed4155e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3442,7 +3442,6 @@ static void free_event_rcu(struct rcu_head *head) if (event->ns) put_pid_ns(event->ns); perf_event_free_filter(event); - perf_event_free_bpf_prog(event); kfree(event); } @@ -3573,6 +3572,8 @@ static void __free_event(struct perf_event *event) put_callchain_buffers(); } + perf_event_free_bpf_prog(event); + if (event->destroy) event->destroy(event); -- cgit v1.2.3 From aa319bcd366349c6f72fcd331da89d3d06090651 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Fri, 22 May 2015 18:30:20 +0300 Subject: perf: Disallow sparse AUX allocations for non-SG PMUs in overwrite mode PMUs that don't support hardware scatter tables require big contiguous chunks of memory and a PMI to switch between them. However, in overwrite using a PMI for this purpose adds extra overhead that the users would like to avoid. Thus, in overwrite mode for such PMUs we can only allow one contiguous chunk for the entire requested buffer. This patch changes the behavior accordingly, so that if the buddy allocator fails to come up with a single high-order chunk for the entire requested buffer, the allocation will fail. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: hpa@zytor.com Link: http://lkml.kernel.org/r/1432308626-18845-2-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 232f00f273cb..725c416085e3 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -493,6 +493,20 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, rb->aux_pages[rb->aux_nr_pages] = page_address(page++); } + /* + * In overwrite mode, PMUs that don't support SG may not handle more + * than one contiguous allocation, since they rely on PMI to do double + * buffering. In this case, the entire buffer has to be one contiguous + * chunk. + */ + if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) && + overwrite) { + struct page *page = virt_to_page(rb->aux_pages[0]); + + if (page_private(page) != max_order) + goto out; + } + rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages, overwrite); if (!rb->aux_priv) -- cgit v1.2.3 From 9b7b819ca1e508195feed5ece558dca66adeef05 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 4 Jun 2015 23:57:18 +0200 Subject: compat: cleanup coding in compat_get_bitmap() and compat_put_bitmap() In the functions compat_get_bitmap() and compat_put_bitmap() the variable nr_compat_longs stores how many compat_ulong_t words should be copied in a loop. The copy loop itself is this: if (nr_compat_longs-- > 0) { if (__get_user(um, umask)) return -EFAULT; } else { um = 0; } Since nr_compat_longs gets unconditionally decremented in each loop and since it's type is unsigned this could theoretically lead to out of bounds accesses to userspace if nr_compat_longs wraps around to (unsigned)(-1). Although the callers currently do not trigger out-of-bounds accesses, we should better implement the loop in a safe way to completely avoid such warp-arounds. Signed-off-by: Helge Deller Cc: Linus Torvalds Cc: Al Viro --- kernel/compat.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 24f00610c575..333d364be29d 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -912,7 +912,8 @@ long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, * bitmap. We must however ensure the end of the * kernel bitmap is zeroed. */ - if (nr_compat_longs-- > 0) { + if (nr_compat_longs) { + nr_compat_longs--; if (__get_user(um, umask)) return -EFAULT; } else { @@ -954,7 +955,8 @@ long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, * We dont want to write past the end of the userspace * bitmap. */ - if (nr_compat_longs-- > 0) { + if (nr_compat_longs) { + nr_compat_longs--; if (__put_user(um, umask)) return -EFAULT; } -- cgit v1.2.3 From cee34d88cabd1ba5fc93e09b5b12232bc9338c7c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2015 12:50:13 +0200 Subject: lockdep: Fix a race between /proc/lock_stat and module unload The lock_class iteration of /proc/lock_stat is not serialized against the lockdep_free_key_range() call from module unload. Therefore it can happen that we find a class of which ->name/->key are no longer valid. There is a further bug in zap_class() that left ->name dangling. Cure this. Use RCU_INIT_POINTER() because NULL. Since lockdep_free_key_range() is rcu_sched serialized, we can read both ->name and ->key under rcu_read_lock_sched() (preempt-disable) and be assured that if we observe a !NULL value it stays safe to use for as long as we hold that lock. If we observe both NULL, skip the entry. Reported-by: Jerome Marchand Tested-by: Jerome Marchand Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150602105013.GS3644@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 3 ++- kernel/locking/lockdep_proc.c | 22 +++++++++++++++++----- 2 files changed, 19 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index a0831e1b99f4..aaeae885d9af 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3900,7 +3900,8 @@ static void zap_class(struct lock_class *class) list_del_rcu(&class->hash_entry); list_del_rcu(&class->lock_entry); - class->key = NULL; + RCU_INIT_POINTER(class->key, NULL); + RCU_INIT_POINTER(class->name, NULL); } static inline int within(const void *addr, void *start, unsigned long size) diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index ef43ac4bafb5..d83d798bef95 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -426,10 +426,12 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt) static void seq_stats(struct seq_file *m, struct lock_stat_data *data) { - char name[39]; - struct lock_class *class; + struct lockdep_subclass_key *ckey; struct lock_class_stats *stats; + struct lock_class *class; + const char *cname; int i, namelen; + char name[39]; class = data->class; stats = &data->stats; @@ -440,15 +442,25 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data) if (class->subclass) namelen -= 2; - if (!class->name) { + rcu_read_lock_sched(); + cname = rcu_dereference_sched(class->name); + ckey = rcu_dereference_sched(class->key); + + if (!cname && !ckey) { + rcu_read_unlock_sched(); + return; + + } else if (!cname) { char str[KSYM_NAME_LEN]; const char *key_name; - key_name = __get_key_name(class->key, str); + key_name = __get_key_name(ckey, str); snprintf(name, namelen, "%s", key_name); } else { - snprintf(name, namelen, "%s", class->name); + snprintf(name, namelen, "%s", cname); } + rcu_read_unlock_sched(); + namelen = strlen(name); if (class->name_version > 1) { snprintf(name+namelen, 3, "#%d", class->name_version); -- cgit v1.2.3 From 8e76d4eecf7afeec9328e21cd5880e281838d0d6 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 10 Jun 2015 11:15:00 -0700 Subject: sched, numa: do not hint for NUMA balancing on VM_MIXEDMAP mappings Jovi Zhangwei reported the following problem Below kernel vm bug can be triggered by tcpdump which mmaped a lot of pages with GFP_COMP flag. [Mon May 25 05:29:33 2015] page:ffffea0015414000 count:66 mapcount:1 mapping: (null) index:0x0 [Mon May 25 05:29:33 2015] flags: 0x20047580004000(head) [Mon May 25 05:29:33 2015] page dumped because: VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page)) [Mon May 25 05:29:33 2015] ------------[ cut here ]------------ [Mon May 25 05:29:33 2015] kernel BUG at mm/migrate.c:1661! [Mon May 25 05:29:33 2015] invalid opcode: 0000 [#1] SMP In this case it was triggered by running tcpdump but it's not necessary reproducible on all systems. sudo tcpdump -i bond0.100 'tcp port 4242' -c 100000000000 -w 4242.pcap Compound pages cannot be migrated and it was not expected that such pages be marked for NUMA balancing. This did not take into account that drivers such as net/packet/af_packet.c may insert compound pages into userspace with vm_insert_page. This patch tells the NUMA balancing protection scanner to skip all VM_MIXEDMAP mappings which avoids the possibility that compound pages are marked for migration. Signed-off-by: Mel Gorman Reported-by: Jovi Zhangwei Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ffeaa4105e48..c2980e8733bc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2181,7 +2181,7 @@ void task_numa_work(struct callback_head *work) } for (; vma; vma = vma->vm_next) { if (!vma_migratable(vma) || !vma_policy_mof(vma) || - is_vm_hugetlb_page(vma)) { + is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { continue; } -- cgit v1.2.3 From 108029323910c5dd1ef8fa2d10da1ce5fbce6e12 Mon Sep 17 00:00:00 2001 From: Wang Long Date: Wed, 10 Jun 2015 08:12:37 +0000 Subject: ring-buffer-benchmark: Fix the wrong sched_priority of producer The producer should be used producer_fifo as its sched_priority, so correct it. Link: http://lkml.kernel.org/r/1433923957-67842-1-git-send-email-long.wanglong@huawei.com Cc: stable@vger.kernel.org # 2.6.33+ Signed-off-by: Wang Long Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 13d945c0d03f..1b28df2d9104 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -450,7 +450,7 @@ static int __init ring_buffer_benchmark_init(void) if (producer_fifo >= 0) { struct sched_param param = { - .sched_priority = consumer_fifo + .sched_priority = producer_fifo }; sched_setscheduler(producer, SCHED_FIFO, ¶m); } else -- cgit v1.2.3