From b416d439eca0cf42482816cdc4ef6a8ab5a5ac2e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Sat, 21 Mar 2020 18:22:41 -0700 Subject: x86/mm: split vmalloc_sync_all() commit 763802b53a427ed3cbd419dbba255c414fdd9e7c upstream. Commit 3f8fd02b1bf1 ("mm/vmalloc: Sync unmappings in __purge_vmap_area_lazy()") introduced a call to vmalloc_sync_all() in the vunmap() code-path. While this change was necessary to maintain correctness on x86-32-pae kernels, it also adds additional cycles for architectures that don't need it. Specifically on x86-64 with CONFIG_VMAP_STACK=y some people reported severe performance regressions in micro-benchmarks because it now also calls the x86-64 implementation of vmalloc_sync_all() on vunmap(). But the vmalloc_sync_all() implementation on x86-64 is only needed for newly created mappings. To avoid the unnecessary work on x86-64 and to gain the performance back, split up vmalloc_sync_all() into two functions: * vmalloc_sync_mappings(), and * vmalloc_sync_unmappings() Most call-sites to vmalloc_sync_all() only care about new mappings being synchronized. The only exception is the new call-site added in the above mentioned commit. Shile Zhang directed us to a report of an 80% regression in reaim throughput. Fixes: 3f8fd02b1bf1 ("mm/vmalloc: Sync unmappings in __purge_vmap_area_lazy()") Reported-by: kernel test robot Reported-by: Shile Zhang Signed-off-by: Joerg Roedel Signed-off-by: Andrew Morton Tested-by: Borislav Petkov Acked-by: Rafael J. Wysocki [GHES] Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Link: http://lkml.kernel.org/r/20191009124418.8286-1-joro@8bytes.org Link: https://lists.01.org/hyperkitty/list/lkp@lists.01.org/thread/4D3JPPHBNOSPFK2KEPC6KGKS6J25AIDB/ Link: http://lkml.kernel.org/r/20191113095530.228959-1-shile.zhang@linux.alibaba.com Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/notifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/notifier.c b/kernel/notifier.c index fd2c9acbcc19..0f70f1b6fdaa 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -552,7 +552,7 @@ NOKPROBE_SYMBOL(notify_die); int register_die_notifier(struct notifier_block *nb) { - vmalloc_sync_all(); + vmalloc_sync_mappings(); return atomic_notifier_chain_register(&die_chain, nb); } EXPORT_SYMBOL_GPL(register_die_notifier); -- cgit v1.2.3 From 24bbfe34bb44c036c3a0874bf74fc2387d5557bf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 4 Mar 2020 11:28:31 +0100 Subject: futex: Fix inode life-time issue commit 8019ad13ef7f64be44d4f892af9c840179009254 upstream. As reported by Jann, ihold() does not in fact guarantee inode persistence. And instead of making it so, replace the usage of inode pointers with a per boot, machine wide, unique inode identifier. This sequence number is global, but shared (file backed) futexes are rare enough that this should not become a performance issue. Reported-by: Jann Horn Suggested-by: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Greg Kroah-Hartman --- kernel/futex.c | 89 ++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 53 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 15d850ffbe29..1aa3acdce484 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -407,7 +407,7 @@ static void get_futex_key_refs(union futex_key *key) switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: - ihold(key->shared.inode); /* implies MB (B) */ + smp_mb(); /* explicit smp_mb(); (B) */ break; case FUT_OFF_MMSHARED: futex_get_mm(key); /* implies MB (B) */ @@ -438,7 +438,6 @@ static void drop_futex_key_refs(union futex_key *key) switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: - iput(key->shared.inode); break; case FUT_OFF_MMSHARED: mmdrop(key->private.mm); @@ -446,6 +445,46 @@ static void drop_futex_key_refs(union futex_key *key) } } +/* + * Generate a machine wide unique identifier for this inode. + * + * This relies on u64 not wrapping in the life-time of the machine; which with + * 1ns resolution means almost 585 years. + * + * This further relies on the fact that a well formed program will not unmap + * the file while it has a (shared) futex waiting on it. This mapping will have + * a file reference which pins the mount and inode. + * + * If for some reason an inode gets evicted and read back in again, it will get + * a new sequence number and will _NOT_ match, even though it is the exact same + * file. + * + * It is important that match_futex() will never have a false-positive, esp. + * for PI futexes that can mess up the state. The above argues that false-negatives + * are only possible for malformed programs. + */ +static u64 get_inode_sequence_number(struct inode *inode) +{ + static atomic64_t i_seq; + u64 old; + + /* Does the inode already have a sequence number? */ + old = atomic64_read(&inode->i_sequence); + if (likely(old)) + return old; + + for (;;) { + u64 new = atomic64_add_return(1, &i_seq); + if (WARN_ON_ONCE(!new)) + continue; + + old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new); + if (old) + return old; + return new; + } +} + /** * get_futex_key() - Get parameters which are the keys for a futex * @uaddr: virtual address of the futex @@ -458,9 +497,15 @@ static void drop_futex_key_refs(union futex_key *key) * * The key words are stored in *key on success. * - * For shared mappings, it's (page->index, file_inode(vma->vm_file), - * offset_within_page). For private mappings, it's (uaddr, current->mm). - * We can usually work out the index without swapping in the page. + * For shared mappings (when @fshared), the key is: + * ( inode->i_sequence, page->index, offset_within_page ) + * [ also see get_inode_sequence_number() ] + * + * For private mappings (or when !@fshared), the key is: + * ( current->mm, address, 0 ) + * + * This allows (cross process, where applicable) identification of the futex + * without keeping the page pinned for the duration of the FUTEX_WAIT. * * lock_page() might sleep, the caller should not hold a spinlock. */ @@ -628,8 +673,6 @@ again: key->private.mm = mm; key->private.address = address; - get_futex_key_refs(key); /* implies smp_mb(); (B) */ - } else { struct inode *inode; @@ -661,40 +704,14 @@ again: goto again; } - /* - * Take a reference unless it is about to be freed. Previously - * this reference was taken by ihold under the page lock - * pinning the inode in place so i_lock was unnecessary. The - * only way for this check to fail is if the inode was - * truncated in parallel which is almost certainly an - * application bug. In such a case, just retry. - * - * We are not calling into get_futex_key_refs() in file-backed - * cases, therefore a successful atomic_inc return below will - * guarantee that get_futex_key() will still imply smp_mb(); (B). - */ - if (!atomic_inc_not_zero(&inode->i_count)) { - rcu_read_unlock(); - put_page(page_head); - - goto again; - } - - /* Should be impossible but lets be paranoid for now */ - if (WARN_ON_ONCE(inode->i_mapping != mapping)) { - err = -EFAULT; - rcu_read_unlock(); - iput(inode); - - goto out; - } - key->both.offset |= FUT_OFF_INODE; /* inode-based key */ - key->shared.inode = inode; + key->shared.i_seq = get_inode_sequence_number(inode); key->shared.pgoff = basepage_index(page); rcu_read_unlock(); } + get_futex_key_refs(key); /* implies smp_mb(); (B) */ + out: put_page(page_head); return err; -- cgit v1.2.3 From 9679d496df60a33696755b76dc79b42671b0737f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 8 Mar 2020 19:07:17 +0100 Subject: futex: Unbreak futex hashing commit 8d67743653dce5a0e7aa500fcccb237cde7ad88e upstream. The recent futex inode life time fix changed the ordering of the futex key union struct members, but forgot to adjust the hash function accordingly, As a result the hashing omits the leading 64bit and even hashes beyond the futex key causing a bad hash distribution which led to a ~100% performance regression. Hand in the futex key pointer instead of a random struct member and make the size calculation based of the struct offset. Fixes: 8019ad13ef7f ("futex: Fix inode life-time issue") Reported-by: Rong Chen Decoded-by: Linus Torvalds Signed-off-by: Thomas Gleixner Tested-by: Rong Chen Link: https://lkml.kernel.org/r/87h7yy90ve.fsf@nanos.tec.linutronix.de Signed-off-by: Greg Kroah-Hartman --- kernel/futex.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 1aa3acdce484..a322303b4d75 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -378,9 +378,9 @@ static inline int hb_waiters_pending(struct futex_hash_bucket *hb) */ static struct futex_hash_bucket *hash_futex(union futex_key *key) { - u32 hash = jhash2((u32*)&key->both.word, - (sizeof(key->both.word)+sizeof(key->both.ptr))/4, + u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4, key->both.offset); + return &futex_queues[hash & (futex_hashsize - 1)]; } -- cgit v1.2.3 From b0b9907a35bf5e2741586ea47057dad2caca8d3f Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Fri, 13 Mar 2020 20:33:07 +0000 Subject: genirq: Fix reference leaks on irq affinity notifiers commit df81dfcfd6991d547653d46c051bac195cd182c1 upstream. The handling of notify->work did not properly maintain notify->kref in two cases: 1) where the work was already scheduled, another irq_set_affinity_locked() would get the ref and (no-op-ly) schedule the work. Thus when irq_affinity_notify() ran, it would drop the original ref but not the additional one. 2) when cancelling the (old) work in irq_set_affinity_notifier(), if there was outstanding work a ref had been got for it but was never put. Fix both by checking the return values of the work handling functions (schedule_work() for (1) and cancel_work_sync() for (2)) and put the extra ref if the return value indicates preexisting work. Fixes: cd7eab44e994 ("genirq: Add IRQ affinity notifiers") Fixes: 59c39840f5ab ("genirq: Prevent use-after-free and work list corruption") Signed-off-by: Edward Cree Signed-off-by: Thomas Gleixner Acked-by: Ben Hutchings Link: https://lkml.kernel.org/r/24f5983f-2ab5-e83a-44ee-a45b5f9300f5@solarflare.com Signed-off-by: Greg Kroah-Hartman --- kernel/irq/manage.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 92c7eb1aeded..14aaaa61e905 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -220,7 +220,11 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, if (desc->affinity_notify) { kref_get(&desc->affinity_notify->kref); - schedule_work(&desc->affinity_notify->work); + if (!schedule_work(&desc->affinity_notify->work)) { + /* Work was already scheduled, drop our extra ref */ + kref_put(&desc->affinity_notify->kref, + desc->affinity_notify->release); + } } irqd_set(data, IRQD_AFFINITY_SET); @@ -320,7 +324,10 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) raw_spin_unlock_irqrestore(&desc->lock, flags); if (old_notify) { - cancel_work_sync(&old_notify->work); + if (cancel_work_sync(&old_notify->work)) { + /* Pending work had a ref, put that one too */ + kref_put(&old_notify->kref, old_notify->release); + } kref_put(&old_notify->kref, old_notify->release); } -- cgit v1.2.3 From 2350f8a157932333b3a06b55566d829098994b67 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 20 Mar 2020 10:48:13 +0100 Subject: bpf: Explicitly memset the bpf_attr structure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 8096f229421f7b22433775e928d506f0342e5907 upstream. For the bpf syscall, we are relying on the compiler to properly zero out the bpf_attr union that we copy userspace data into. Unfortunately that doesn't always work properly, padding and other oddities might not be correctly zeroed, and in some tests odd things have been found when the stack is pre-initialized to other values. Fix this by explicitly memsetting the structure to 0 before using it. Reported-by: Maciej Żenczykowski Reported-by: John Stultz Reported-by: Alexander Potapenko Reported-by: Alistair Delva Signed-off-by: Greg Kroah-Hartman Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://android-review.googlesource.com/c/kernel/common/+/1235490 Link: https://lore.kernel.org/bpf/20200320094813.GA421650@kroah.com Signed-off-by: Greg Kroah-Hartman --- kernel/bpf/syscall.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 04fc1022ad9f..fd3fd8d17ef5 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -667,7 +667,7 @@ static int bpf_obj_get(const union bpf_attr *attr) SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { - union bpf_attr attr = {}; + union bpf_attr attr; int err; if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN)) @@ -703,6 +703,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz } /* copy attributes from user space, may be less than sizeof(bpf_attr) */ + memset(&attr, 0, sizeof(attr)); if (copy_from_user(&attr, uattr, size) != 0) return -EFAULT; -- cgit v1.2.3