From 1c0f4e0ebb791cad8fee77d8de1ceb684e631698 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 7 Dec 2016 14:54:38 +0100 Subject: hotplug: Make register and unregister notifier API symmetric commit 777c6e0daebb3fcefbbd6f620410a946b07ef6d0 upstream. Yu Zhao has noticed that __unregister_cpu_notifier only unregisters its notifiers when HOTPLUG_CPU=y while the registration might succeed even when HOTPLUG_CPU=n if MODULE is enabled. This means that e.g. zswap might keep a stale notifier on the list on the manual clean up during the pool tear down and thus corrupt the list. Resulting in the following [ 144.964346] BUG: unable to handle kernel paging request at ffff880658a2be78 [ 144.971337] IP: [] raw_notifier_chain_register+0x1b/0x40 [ 145.122628] Call Trace: [ 145.125086] [] __register_cpu_notifier+0x18/0x20 [ 145.131350] [] zswap_pool_create+0x273/0x400 [ 145.137268] [] __zswap_param_set+0x1fc/0x300 [ 145.143188] [] ? trace_hardirqs_on+0xd/0x10 [ 145.149018] [] ? kernel_param_lock+0x28/0x30 [ 145.154940] [] ? __might_fault+0x4f/0xa0 [ 145.160511] [] zswap_compressor_param_set+0x17/0x20 [ 145.167035] [] param_attr_store+0x5c/0xb0 [ 145.172694] [] module_attr_store+0x1d/0x30 [ 145.178443] [] sysfs_kf_write+0x4f/0x70 [ 145.183925] [] kernfs_fop_write+0x149/0x180 [ 145.189761] [] __vfs_write+0x18/0x40 [ 145.194982] [] vfs_write+0xb2/0x1a0 [ 145.200122] [] SyS_write+0x52/0xa0 [ 145.205177] [] entry_SYSCALL_64_fastpath+0x12/0x17 This can be even triggered manually by changing /sys/module/zswap/parameters/compressor multiple times. Fix this issue by making unregister APIs symmetric to the register so there are no surprises. Fixes: 47e627bc8c9a ("[PATCH] hotplug: Allow modules to use the cpu hotplug notifiers even if !CONFIG_HOTPLUG_CPU") Reported-and-tested-by: Yu Zhao Signed-off-by: Michal Hocko Cc: linux-mm@kvack.org Cc: Andrew Morton Cc: Dan Streetman Link: http://lkml.kernel.org/r/20161207135438.4310-1-mhocko@kernel.org Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- include/linux/cpu.h | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index d2ca8c38f9c4..3ea9aae2387d 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -131,22 +131,16 @@ enum { { .notifier_call = fn, .priority = pri }; \ __register_cpu_notifier(&fn##_nb); \ } -#else /* #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE) */ -#define cpu_notifier(fn, pri) do { (void)(fn); } while (0) -#define __cpu_notifier(fn, pri) do { (void)(fn); } while (0) -#endif /* #else #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE) */ -#ifdef CONFIG_HOTPLUG_CPU extern int register_cpu_notifier(struct notifier_block *nb); extern int __register_cpu_notifier(struct notifier_block *nb); extern void unregister_cpu_notifier(struct notifier_block *nb); extern void __unregister_cpu_notifier(struct notifier_block *nb); -#else -#ifndef MODULE -extern int register_cpu_notifier(struct notifier_block *nb); -extern int __register_cpu_notifier(struct notifier_block *nb); -#else +#else /* #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE) */ +#define cpu_notifier(fn, pri) do { (void)(fn); } while (0) +#define __cpu_notifier(fn, pri) do { (void)(fn); } while (0) + static inline int register_cpu_notifier(struct notifier_block *nb) { return 0; @@ -156,7 +150,6 @@ static inline int __register_cpu_notifier(struct notifier_block *nb) { return 0; } -#endif static inline void unregister_cpu_notifier(struct notifier_block *nb) { -- cgit v1.2.3 From 03eed7afbc09e061f66b448daf7863174c3dc3f3 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 13 Oct 2016 21:23:16 -0500 Subject: mm: Add a user_ns owner to mm_struct and fix ptrace permission checks commit bfedb589252c01fa505ac9f6f2a3d5d68d707ef4 upstream. During exec dumpable is cleared if the file that is being executed is not readable by the user executing the file. A bug in ptrace_may_access allows reading the file if the executable happens to enter into a subordinate user namespace (aka clone(CLONE_NEWUSER), unshare(CLONE_NEWUSER), or setns(fd, CLONE_NEWUSER). This problem is fixed with only necessary userspace breakage by adding a user namespace owner to mm_struct, captured at the time of exec, so it is clear in which user namespace CAP_SYS_PTRACE must be present in to be able to safely give read permission to the executable. The function ptrace_may_access is modified to verify that the ptracer has CAP_SYS_ADMIN in task->mm->user_ns instead of task->cred->user_ns. This ensures that if the task changes it's cred into a subordinate user namespace it does not become ptraceable. The function ptrace_attach is modified to only set PT_PTRACE_CAP when CAP_SYS_PTRACE is held over task->mm->user_ns. The intent of PT_PTRACE_CAP is to be a flag to note that whatever permission changes the task might go through the tracer has sufficient permissions for it not to be an issue. task->cred->user_ns is always the same as or descendent of mm->user_ns. Which guarantees that having CAP_SYS_PTRACE over mm->user_ns is the worst case for the tasks credentials. To prevent regressions mm->dumpable and mm->user_ns are not considered when a task has no mm. As simply failing ptrace_may_attach causes regressions in privileged applications attempting to read things such as /proc//stat Acked-by: Kees Cook Tested-by: Cyrill Gorcunov Fixes: 8409cca70561 ("userns: allow ptrace from non-init user namespaces") Signed-off-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- include/linux/mm_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f8d1492a114f..2ccccbfcd532 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -469,6 +469,7 @@ struct mm_struct { */ struct task_struct __rcu *owner; #endif + struct user_namespace *user_ns; /* store ref to file /proc//exe symlink points to */ struct file __rcu *exe_file; -- cgit v1.2.3 From 1c1f15f8ebfbd5042883a1c9ae4b18a6299c9c5f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 14 Nov 2016 18:48:07 -0600 Subject: ptrace: Capture the ptracer's creds not PT_PTRACE_CAP commit 64b875f7ac8a5d60a4e191479299e931ee949b67 upstream. When the flag PT_PTRACE_CAP was added the PTRACE_TRACEME path was overlooked. This can result in incorrect behavior when an application like strace traces an exec of a setuid executable. Further PT_PTRACE_CAP does not have enough information for making good security decisions as it does not report which user namespace the capability is in. This has already allowed one mistake through insufficient granulariy. I found this issue when I was testing another corner case of exec and discovered that I could not get strace to set PT_PTRACE_CAP even when running strace as root with a full set of caps. This change fixes the above issue with strace allowing stracing as root a setuid executable without disabling setuid. More fundamentaly this change allows what is allowable at all times, by using the correct information in it's decision. Fixes: 4214e42f96d4 ("v2.4.9.11 -> v2.4.9.12") Signed-off-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- include/linux/capability.h | 1 + include/linux/ptrace.h | 1 - include/linux/sched.h | 1 + 3 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/capability.h b/include/linux/capability.h index 5f8249d378a2..7d38697c7e63 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -249,6 +249,7 @@ static inline bool ns_capable_noaudit(struct user_namespace *ns, int cap) #endif /* CONFIG_MULTIUSER */ extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap); extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); +extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); /* audit system wants to get cap info from files as well */ extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 504c98a278d4..e13bfdf7f314 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -19,7 +19,6 @@ #define PT_SEIZED 0x00010000 /* SEIZE used, enable new behavior */ #define PT_PTRACED 0x00000001 #define PT_DTRACE 0x00000002 /* delayed trace (used on m68k, i386) */ -#define PT_PTRACE_CAP 0x00000004 /* ptracer can follow suid-exec */ #define PT_OPT_FLAG_SHIFT 3 /* PT_TRACE_* event enable flags */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 1c0193baea2a..ce0f61dcd887 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1540,6 +1540,7 @@ struct task_struct { struct list_head cpu_timers[3]; /* process credentials */ + const struct cred __rcu *ptracer_cred; /* Tracer's credentials at attach */ const struct cred __rcu *real_cred; /* objective and real subjective task * credentials (COW) */ const struct cred __rcu *cred; /* effective (overridable) subjective task -- cgit v1.2.3 From b35f34f66943fa1b8feadb037d054a141723f7fe Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 16 Nov 2016 22:06:51 -0600 Subject: exec: Ensure mm->user_ns contains the execed files commit f84df2a6f268de584a201e8911384a2d244876e3 upstream. When the user namespace support was merged the need to prevent ptrace from revealing the contents of an unreadable executable was overlooked. Correct this oversight by ensuring that the executed file or files are in mm->user_ns, by adjusting mm->user_ns. Use the new function privileged_wrt_inode_uidgid to see if the executable is a member of the user namespace, and as such if having CAP_SYS_PTRACE in the user namespace should allow tracing the executable. If not update mm->user_ns to the parent user namespace until an appropriate parent is found. Reported-by: Jann Horn Fixes: 9e4a36ece652 ("userns: Fail exec for suid and sgid binaries with ids outside our user namespace.") Signed-off-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- include/linux/capability.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/capability.h b/include/linux/capability.h index 7d38697c7e63..2654f75a4c46 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -247,6 +247,7 @@ static inline bool ns_capable_noaudit(struct user_namespace *ns, int cap) return true; } #endif /* CONFIG_MULTIUSER */ +extern bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode); extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap); extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); -- cgit v1.2.3 From becfb50c66cb290f05552cb8c9b46493540a14ed Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 22 Mar 2016 14:27:11 -0700 Subject: cred/userns: define current_user_ns() as a function commit 0335695dfa4df01edff5bb102b9a82a0668ee51e upstream. The current_user_ns() macro currently returns &init_user_ns when user namespaces are disabled, and that causes several warnings when building with gcc-6.0 in code that compares the result of the macro to &init_user_ns itself: fs/xfs/xfs_ioctl.c: In function 'xfs_ioctl_setattr_check_projid': fs/xfs/xfs_ioctl.c:1249:22: error: self-comparison always evaluates to true [-Werror=tautological-compare] if (current_user_ns() == &init_user_ns) This is a legitimate warning in principle, but here it isn't really helpful, so I'm reprasing the definition in a way that shuts up the warning. Apparently gcc only warns when comparing identical literals, but it can figure out that the result of an inline function can be identical to a constant expression in order to optimize a condition yet not warn about the fact that the condition is known at compile time. This is exactly what we want here, and it looks reasonable because we generally prefer inline functions over macros anyway. Signed-off-by: Arnd Bergmann Acked-by: Serge Hallyn Cc: David Howells Cc: Yaowei Bai Cc: James Morris Cc: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/capability.h | 2 -- include/linux/cred.h | 5 ++++- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/capability.h b/include/linux/capability.h index 2654f75a4c46..b20ffe23a09b 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -40,8 +40,6 @@ struct inode; struct dentry; struct user_namespace; -struct user_namespace *current_user_ns(void); - extern const kernel_cap_t __cap_empty_set; extern const kernel_cap_t __cap_init_eff_set; diff --git a/include/linux/cred.h b/include/linux/cred.h index 8d70e1361ecd..257db64562e5 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -377,7 +377,10 @@ extern struct user_namespace init_user_ns; #ifdef CONFIG_USER_NS #define current_user_ns() (current_cred_xxx(user_ns)) #else -#define current_user_ns() (&init_user_ns) +static inline struct user_namespace *current_user_ns(void) +{ + return &init_user_ns; +} #endif -- cgit v1.2.3 From 67b21973ecf4775a5761a9d4ecc0fc9c7994ea78 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 10 Jan 2017 12:24:15 -0800 Subject: gro: Disable frag0 optimization on IPv6 ext headers [ Upstream commit 57ea52a865144aedbcd619ee0081155e658b6f7d ] The GRO fast path caches the frag0 address. This address becomes invalid if frag0 is modified by pskb_may_pull or its variants. So whenever that happens we must disable the frag0 optimization. This is usually done through the combination of gro_header_hard and gro_header_slow, however, the IPv6 extension header path did the pulling directly and would continue to use the GRO fast path incorrectly. This patch fixes it by disabling the fast path when we enter the IPv6 extension header path. Fixes: 78a478d0efd9 ("gro: Inline skb_gro_header and cache frag0 virtual address") Reported-by: Slava Shwartsman Signed-off-by: Herbert Xu Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/linux/netdevice.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9d6025703f73..93a6a2c66d15 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2325,14 +2325,19 @@ static inline int skb_gro_header_hard(struct sk_buff *skb, unsigned int hlen) return NAPI_GRO_CB(skb)->frag0_len < hlen; } +static inline void skb_gro_frag0_invalidate(struct sk_buff *skb) +{ + NAPI_GRO_CB(skb)->frag0 = NULL; + NAPI_GRO_CB(skb)->frag0_len = 0; +} + static inline void *skb_gro_header_slow(struct sk_buff *skb, unsigned int hlen, unsigned int offset) { if (!pskb_may_pull(skb, hlen)) return NULL; - NAPI_GRO_CB(skb)->frag0 = NULL; - NAPI_GRO_CB(skb)->frag0_len = 0; + skb_gro_frag0_invalidate(skb); return skb->data + offset; } -- cgit v1.2.3 From 3d27cd4b25272943300d125b20d315fd96ae0794 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 16 Dec 2016 14:30:35 -0800 Subject: jump_labels: API for flushing deferred jump label updates commit b6416e61012429e0277bd15a229222fd17afc1c1 upstream. Modules that use static_key_deferred need a way to synchronize with any delayed work that is still pending when the module is unloaded. Introduce static_key_deferred_flush() which flushes any pending jump label updates. Signed-off-by: David Matlack Acked-by: Peter Zijlstra (Intel) Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- include/linux/jump_label_ratelimit.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/jump_label_ratelimit.h b/include/linux/jump_label_ratelimit.h index 089f70f83e97..23da3af459fe 100644 --- a/include/linux/jump_label_ratelimit.h +++ b/include/linux/jump_label_ratelimit.h @@ -14,6 +14,7 @@ struct static_key_deferred { #ifdef HAVE_JUMP_LABEL extern void static_key_slow_dec_deferred(struct static_key_deferred *key); +extern void static_key_deferred_flush(struct static_key_deferred *key); extern void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl); @@ -26,6 +27,10 @@ static inline void static_key_slow_dec_deferred(struct static_key_deferred *key) STATIC_KEY_CHECK_USE(); static_key_slow_dec(&key->key); } +static inline void static_key_deferred_flush(struct static_key_deferred *key) +{ + STATIC_KEY_CHECK_USE(); +} static inline void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) -- cgit v1.2.3 From 0f64f22c0686ca66cb5d0c88d1322d7cf32e1986 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 22 Jan 2017 14:04:29 -0500 Subject: nfs: Don't increment lock sequence ID after NFS4ERR_MOVED commit 059aa734824165507c65fd30a55ff000afd14983 upstream. Xuan Qi reports that the Linux NFSv4 client failed to lock a file that was migrated. The steps he observed on the wire: 1. The client sent a LOCK request to the source server 2. The source server replied NFS4ERR_MOVED 3. The client switched to the destination server 4. The client sent the same LOCK request to the destination server with a bumped lock sequence ID 5. The destination server rejected the LOCK request with NFS4ERR_BAD_SEQID RFC 3530 section 8.1.5 provides a list of NFS errors which do not bump a lock sequence ID. However, RFC 3530 is now obsoleted by RFC 7530. In RFC 7530 section 9.1.7, this list has been updated by the addition of NFS4ERR_MOVED. Reported-by: Xuan Qi Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- include/linux/nfs4.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index e7e78537aea2..63a817631f06 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -266,7 +266,7 @@ enum nfsstat4 { static inline bool seqid_mutating_err(u32 err) { - /* rfc 3530 section 8.1.5: */ + /* See RFC 7530, section 9.1.7 */ switch (err) { case NFS4ERR_STALE_CLIENTID: case NFS4ERR_STALE_STATEID: @@ -275,6 +275,7 @@ static inline bool seqid_mutating_err(u32 err) case NFS4ERR_BADXDR: case NFS4ERR_RESOURCE: case NFS4ERR_NOFILEHANDLE: + case NFS4ERR_MOVED: return false; }; return true; -- cgit v1.2.3 From d79e41188a1e20cb71fab70c7a76f7b0ab3ea01c Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Fri, 20 Jan 2017 16:48:39 +0800 Subject: SUNRPC: cleanup ida information when removing sunrpc module commit c929ea0b910355e1876c64431f3d5802f95b3d75 upstream. After removing sunrpc module, I get many kmemleak information as, unreferenced object 0xffff88003316b1e0 (size 544): comm "gssproxy", pid 2148, jiffies 4294794465 (age 4200.081s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kmemleak_alloc+0x4a/0xa0 [] kmem_cache_alloc+0x15e/0x1f0 [] ida_pre_get+0xaa/0x150 [] ida_simple_get+0xad/0x180 [] nlmsvc_lookup_host+0x4ab/0x7f0 [lockd] [] lockd+0x4d/0x270 [lockd] [] param_set_timeout+0x55/0x100 [lockd] [] svc_defer+0x114/0x3f0 [sunrpc] [] svc_defer+0x2d7/0x3f0 [sunrpc] [] rpc_show_info+0x8a/0x110 [sunrpc] [] proc_reg_write+0x7f/0xc0 [] __vfs_write+0xdf/0x3c0 [] vfs_write+0xef/0x240 [] SyS_write+0xad/0x130 [] entry_SYSCALL_64_fastpath+0x1a/0xa9 [] 0xffffffffffffffff I found, the ida information (dynamic memory) isn't cleanup. Signed-off-by: Kinglong Mee Fixes: 2f048db4680a ("SUNRPC: Add an identifier for struct rpc_clnt") Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- include/linux/sunrpc/clnt.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 9b6027c51736..316a5525b730 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -180,5 +180,6 @@ const char *rpc_peeraddr2str(struct rpc_clnt *, enum rpc_display_format_t); int rpc_localaddr(struct rpc_clnt *, struct sockaddr *, size_t); const char *rpc_proc_name(const struct rpc_task *task); +void rpc_cleanup_clids(void); #endif /* __KERNEL__ */ #endif /* _LINUX_SUNRPC_CLNT_H */ -- cgit v1.2.3 From df3ebc9cd524a9ff3fb070803b5271f1127d0610 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 26 Oct 2016 15:29:51 -0700 Subject: ANDROID: mnt: Add filesystem private data to mount points This starts to add private data associated directly to mount points. The intent is to give filesystems a sense of where they have come from, as a means of letting a filesystem take different actions based on this information. Change-Id: Ie769d7b3bb2f5972afe05c1bf16cf88c91647ab2 Signed-off-by: Daniel Rosenberg --- include/linux/fs.h | 3 +++ include/linux/mount.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index e1a123760dbf..87858f72a143 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1733,6 +1733,8 @@ struct super_operations { int (*unfreeze_fs) (struct super_block *); int (*statfs) (struct dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); + void *(*clone_mnt_data) (void *); + void (*copy_mnt_data) (void *, void *); void (*umount_begin) (struct super_block *); int (*show_options)(struct seq_file *, struct dentry *); @@ -1967,6 +1969,7 @@ struct file_system_type { #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ struct dentry *(*mount) (struct file_system_type *, int, const char *, void *); + void *(*alloc_mnt_data) (void); void (*kill_sb) (struct super_block *); struct module *owner; struct file_system_type * next; diff --git a/include/linux/mount.h b/include/linux/mount.h index f822c3c11377..0e9b0977237a 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -67,6 +67,7 @@ struct vfsmount { struct dentry *mnt_root; /* root of the mounted tree */ struct super_block *mnt_sb; /* pointer to superblock */ int mnt_flags; + void *data; }; struct file; /* forward dec */ -- cgit v1.2.3 From 11eebf69848eeeedeff3e3dae0588eb7175b0351 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 26 Oct 2016 15:58:22 -0700 Subject: ANDROID: vfs: Allow filesystems to access their private mount data Now we pass the vfsmount when mounting and remounting. This allows the filesystem to actually set up the mount specific data, although we can't quite do anything with it yet. show_options is expanded to include data that lives with the mount. To avoid changing existing filesystems, these have been added as new vfs functions. Change-Id: If80670bfad9f287abb8ac22457e1b034c9697097 Signed-off-by: Daniel Rosenberg --- include/linux/fs.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 87858f72a143..e26cf250d97d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1733,11 +1733,13 @@ struct super_operations { int (*unfreeze_fs) (struct super_block *); int (*statfs) (struct dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); + int (*remount_fs2) (struct vfsmount *, struct super_block *, int *, char *); void *(*clone_mnt_data) (void *); void (*copy_mnt_data) (void *, void *); void (*umount_begin) (struct super_block *); int (*show_options)(struct seq_file *, struct dentry *); + int (*show_options2)(struct vfsmount *,struct seq_file *, struct dentry *); int (*show_devname)(struct seq_file *, struct dentry *); int (*show_path)(struct seq_file *, struct dentry *); int (*show_stats)(struct seq_file *, struct dentry *); @@ -1969,6 +1971,8 @@ struct file_system_type { #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ struct dentry *(*mount) (struct file_system_type *, int, const char *, void *); + struct dentry *(*mount2) (struct vfsmount *, struct file_system_type *, int, + const char *, void *); void *(*alloc_mnt_data) (void); void (*kill_sb) (struct super_block *); struct module *owner; -- cgit v1.2.3 From db2d40bb63f1906fde48a3b14fc9bb2e609d87db Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 26 Oct 2016 16:27:45 -0700 Subject: ANDROID: vfs: Add permission2 for filesystems with per mount permissions This allows filesystems to use their mount private data to influence the permssions they return in permission2. It has been separated into a new call to avoid disrupting current permission users. Change-Id: I9d416e3b8b6eca84ef3e336bd2af89ddd51df6ca Signed-off-by: Daniel Rosenberg --- include/linux/fs.h | 11 +++++++++++ include/linux/namei.h | 1 + 2 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index e26cf250d97d..eded3090afce 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1539,13 +1539,21 @@ extern bool inode_owner_or_capable(const struct inode *inode); * VFS helper functions.. */ extern int vfs_create(struct inode *, struct dentry *, umode_t, bool); +extern int vfs_create2(struct vfsmount *, struct inode *, struct dentry *, umode_t, bool); extern int vfs_mkdir(struct inode *, struct dentry *, umode_t); +extern int vfs_mkdir2(struct vfsmount *, struct inode *, struct dentry *, umode_t); extern int vfs_mknod(struct inode *, struct dentry *, umode_t, dev_t); +extern int vfs_mknod2(struct vfsmount *, struct inode *, struct dentry *, umode_t, dev_t); extern int vfs_symlink(struct inode *, struct dentry *, const char *); +extern int vfs_symlink2(struct vfsmount *, struct inode *, struct dentry *, const char *); extern int vfs_link(struct dentry *, struct inode *, struct dentry *, struct inode **); +extern int vfs_link2(struct vfsmount *, struct dentry *, struct inode *, struct dentry *, struct inode **); extern int vfs_rmdir(struct inode *, struct dentry *); +extern int vfs_rmdir2(struct vfsmount *, struct inode *, struct dentry *); extern int vfs_unlink(struct inode *, struct dentry *, struct inode **); +extern int vfs_unlink2(struct vfsmount *, struct inode *, struct dentry *, struct inode **); extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int); +extern int vfs_rename2(struct vfsmount *, struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int); extern int vfs_whiteout(struct inode *, struct dentry *); /* @@ -1671,6 +1679,7 @@ struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); const char * (*follow_link) (struct dentry *, void **); int (*permission) (struct inode *, int); + int (*permission2) (struct vfsmount *, struct inode *, int); struct posix_acl * (*get_acl)(struct inode *, int); int (*readlink) (struct dentry *, char __user *,int); @@ -2478,7 +2487,9 @@ extern sector_t bmap(struct inode *, sector_t); #endif extern int notify_change(struct dentry *, struct iattr *, struct inode **); extern int inode_permission(struct inode *, int); +extern int inode_permission2(struct vfsmount *, struct inode *, int); extern int __inode_permission(struct inode *, int); +extern int __inode_permission2(struct vfsmount *, struct inode *, int); extern int generic_permission(struct inode *, int); extern int __check_sticky(struct inode *dir, struct inode *inode); diff --git a/include/linux/namei.h b/include/linux/namei.h index d53c25453aca..023359f18567 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -79,6 +79,7 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); extern struct dentry *lookup_one_len(const char *, struct dentry *, int); +extern struct dentry *lookup_one_len2(const char *, struct vfsmount *mnt, struct dentry *, int); extern int follow_down_one(struct path *); extern int follow_down(struct path *); -- cgit v1.2.3 From e5eeaaf5f7319d0db83925c2729dcfb0cf1a4dbc Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 26 Oct 2016 16:33:11 -0700 Subject: ANDROID: vfs: Add setattr2 for filesystems with per mount permissions This allows filesystems to use their mount private data to influence the permssions they use in setattr2. It has been separated into a new call to avoid disrupting current setattr users. Change-Id: I19959038309284448f1b7f232d579674ef546385 Signed-off-by: Daniel Rosenberg --- include/linux/fs.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index eded3090afce..7f6de206cc5c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1697,6 +1697,7 @@ struct inode_operations { int (*rename2) (struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); int (*setattr) (struct dentry *, struct iattr *); + int (*setattr2) (struct vfsmount *, struct dentry *, struct iattr *); int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t); @@ -2262,6 +2263,8 @@ struct filename { extern long vfs_truncate(struct path *, loff_t); extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs, struct file *filp); +extern int do_truncate2(struct vfsmount *, struct dentry *, loff_t start, + unsigned int time_attrs, struct file *filp); extern int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len); extern long do_sys_open(int dfd, const char __user *filename, int flags, @@ -2486,6 +2489,7 @@ extern void emergency_remount(void); extern sector_t bmap(struct inode *, sector_t); #endif extern int notify_change(struct dentry *, struct iattr *, struct inode **); +extern int notify_change2(struct vfsmount *, struct dentry *, struct iattr *, struct inode **); extern int inode_permission(struct inode *, int); extern int inode_permission2(struct vfsmount *, struct inode *, int); extern int __inode_permission(struct inode *, int); -- cgit v1.2.3 From b82981fb273d3d0759bff5a55449f4c22f977869 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Thu, 12 Jan 2017 14:24:58 -0800 Subject: tcp: fix tcp_fastopen unaligned access complaints on sparc [ Upstream commit 003c941057eaa868ca6fedd29a274c863167230d ] Fix up a data alignment issue on sparc by swapping the order of the cookie byte array field with the length field in struct tcp_fastopen_cookie, and making it a proper union to clean up the typecasting. This addresses log complaints like these: log_unaligned: 113 callbacks suppressed Kernel unaligned access at TPC[976490] tcp_try_fastopen+0x2d0/0x360 Kernel unaligned access at TPC[9764ac] tcp_try_fastopen+0x2ec/0x360 Kernel unaligned access at TPC[9764c8] tcp_try_fastopen+0x308/0x360 Kernel unaligned access at TPC[9764e4] tcp_try_fastopen+0x324/0x360 Kernel unaligned access at TPC[976490] tcp_try_fastopen+0x2d0/0x360 Cc: Eric Dumazet Signed-off-by: Shannon Nelson Acked-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/linux/tcp.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index b386361ba3e8..318c24612458 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -56,8 +56,13 @@ static inline unsigned int tcp_optlen(const struct sk_buff *skb) /* TCP Fast Open Cookie as stored in memory */ struct tcp_fastopen_cookie { + union { + u8 val[TCP_FASTOPEN_COOKIE_MAX]; +#if IS_ENABLED(CONFIG_IPV6) + struct in6_addr addr; +#endif + }; s8 len; - u8 val[TCP_FASTOPEN_COOKIE_MAX]; bool exp; /* In RFC6994 experimental option format */ }; -- cgit v1.2.3 From b3c8c31ed24c39f9a9fb9bee1e42b16640ff0bee Mon Sep 17 00:00:00 2001 From: Douglas Miller Date: Sat, 28 Jan 2017 06:42:20 -0600 Subject: percpu-refcount: fix reference leak during percpu-atomic transition commit 966d2b04e070bc040319aaebfec09e0144dc3341 upstream. percpu_ref_tryget() and percpu_ref_tryget_live() should return "true" IFF they acquire a reference. But the return value from atomic_long_inc_not_zero() is a long and may have high bits set, e.g. PERCPU_COUNT_BIAS, and the return value of the tryget routines is bool so the reference may actually be acquired but the routines return "false" which results in a reference leak since the caller assumes it does not need to do a corresponding percpu_ref_put(). This was seen when performing CPU hotplug during I/O, as hangs in blk_mq_freeze_queue_wait where percpu_ref_kill (blk_mq_freeze_queue_start) raced with percpu_ref_tryget (blk_mq_timeout_work). Sample stack trace: __switch_to+0x2c0/0x450 __schedule+0x2f8/0x970 schedule+0x48/0xc0 blk_mq_freeze_queue_wait+0x94/0x120 blk_mq_queue_reinit_work+0xb8/0x180 blk_mq_queue_reinit_prepare+0x84/0xa0 cpuhp_invoke_callback+0x17c/0x600 cpuhp_up_callbacks+0x58/0x150 _cpu_up+0xf0/0x1c0 do_cpu_up+0x120/0x150 cpu_subsys_online+0x64/0xe0 device_online+0xb4/0x120 online_store+0xb4/0xc0 dev_attr_store+0x68/0xa0 sysfs_kf_write+0x80/0xb0 kernfs_fop_write+0x17c/0x250 __vfs_write+0x6c/0x1e0 vfs_write+0xd0/0x270 SyS_write+0x6c/0x110 system_call+0x38/0xe0 Examination of the queue showed a single reference (no PERCPU_COUNT_BIAS, and __PERCPU_REF_DEAD, __PERCPU_REF_ATOMIC set) and no requests. However, conditions at the time of the race are count of PERCPU_COUNT_BIAS + 0 and __PERCPU_REF_DEAD and __PERCPU_REF_ATOMIC set. The fix is to make the tryget routines use an actual boolean internally instead of the atomic long result truncated to a int. Fixes: e625305b3907 percpu-refcount: make percpu_ref based on longs instead of ints Link: https://bugzilla.kernel.org/show_bug.cgi?id=190751 Signed-off-by: Douglas Miller Reviewed-by: Jens Axboe Signed-off-by: Tejun Heo Fixes: e625305b3907 ("percpu-refcount: make percpu_ref based on longs instead of ints") Signed-off-by: Greg Kroah-Hartman --- include/linux/percpu-refcount.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 12c9b485beb7..abd7c01c84db 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -206,7 +206,7 @@ static inline void percpu_ref_get(struct percpu_ref *ref) static inline bool percpu_ref_tryget(struct percpu_ref *ref) { unsigned long __percpu *percpu_count; - int ret; + bool ret; rcu_read_lock_sched(); @@ -240,7 +240,7 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref) static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) { unsigned long __percpu *percpu_count; - int ret = false; + bool ret = false; rcu_read_lock_sched(); -- cgit v1.2.3 From 87ebcc534d47dd924327daa651c36f876db76f72 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Fri, 3 Feb 2017 13:13:23 -0800 Subject: base/memory, hotplug: fix a kernel oops in show_valid_zones() commit a96dfddbcc04336bbed50dc2b24823e45e09e80c upstream. Reading a sysfs "memoryN/valid_zones" file leads to the following oops when the first page of a range is not backed by struct page. show_valid_zones() assumes that 'start_pfn' is always valid for page_zone(). BUG: unable to handle kernel paging request at ffffea017a000000 IP: show_valid_zones+0x6f/0x160 This issue may happen on x86-64 systems with 64GiB or more memory since their memory block size is bumped up to 2GiB. [1] An example of such systems is desribed below. 0x3240000000 is only aligned by 1GiB and this memory block starts from 0x3200000000, which is not backed by struct page. BIOS-e820: [mem 0x0000003240000000-0x000000603fffffff] usable Since test_pages_in_a_zone() already checks holes, fix this issue by extending this function to return 'valid_start' and 'valid_end' for a given range. show_valid_zones() then proceeds with the valid range. [1] 'Commit bdee237c0343 ("x86: mm: Use 2GB memory block size on large-memory x86-64 systems")' Link: http://lkml.kernel.org/r/20170127222149.30893-3-toshi.kani@hpe.com Signed-off-by: Toshi Kani Cc: Greg Kroah-Hartman Cc: Zhang Zhen Cc: Reza Arbab Cc: David Rientjes Cc: Dan Williams Cc: [4.4+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/memory_hotplug.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 2ea574ff9714..538488bd1d3d 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -85,7 +85,8 @@ extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); /* VM interface that may be used by firmware interface */ extern int online_pages(unsigned long, unsigned long, int); -extern int test_pages_in_a_zone(unsigned long, unsigned long); +extern int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, + unsigned long *valid_start, unsigned long *valid_end); extern void __offline_isolated_pages(unsigned long, unsigned long); typedef void (*online_page_callback_t)(struct page *page); -- cgit v1.2.3 From 3b7ff5ed114fdb65517ee564a717b9c5fa374c94 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 8 Feb 2017 14:30:56 -0800 Subject: cpumask: use nr_cpumask_bits for parsing functions commit 4d59b6ccf000862beed6fc0765d3209f98a8d8a2 upstream. Commit 513e3d2d11c9 ("cpumask: always use nr_cpu_ids in formatting and parsing functions") converted both cpumask printing and parsing functions to use nr_cpu_ids instead of nr_cpumask_bits. While this was okay for the printing functions as it just picked one of the two output formats that we were alternating between depending on a kernel config, doing the same for parsing wasn't okay. nr_cpumask_bits can be either nr_cpu_ids or NR_CPUS. We can always use nr_cpu_ids but that is a variable while NR_CPUS is a constant, so it can be more efficient to use NR_CPUS when we can get away with it. Converting the printing functions to nr_cpu_ids makes sense because it affects how the masks get presented to userspace and doesn't break anything; however, using nr_cpu_ids for parsing functions can incorrectly leave the higher bits uninitialized while reading in these masks from userland. As all testing and comparison functions use nr_cpumask_bits which can be larger than nr_cpu_ids, the parsed cpumasks can erroneously yield false negative results. This made the taskstats interface incorrectly return -EINVAL even when the inputs were correct. Fix it by restoring the parse functions to use nr_cpumask_bits instead of nr_cpu_ids. Link: http://lkml.kernel.org/r/20170206182442.GB31078@htj.duckdns.org Fixes: 513e3d2d11c9 ("cpumask: always use nr_cpu_ids in formatting and parsing functions") Signed-off-by: Tejun Heo Reported-by: Martin Steigerwald Debugged-by: Ben Hutchings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/cpumask.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 59915ea5373c..a91b3b75da0f 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -556,7 +556,7 @@ static inline void cpumask_copy(struct cpumask *dstp, static inline int cpumask_parse_user(const char __user *buf, int len, struct cpumask *dstp) { - return bitmap_parse_user(buf, len, cpumask_bits(dstp), nr_cpu_ids); + return bitmap_parse_user(buf, len, cpumask_bits(dstp), nr_cpumask_bits); } /** @@ -571,7 +571,7 @@ static inline int cpumask_parselist_user(const char __user *buf, int len, struct cpumask *dstp) { return bitmap_parselist_user(buf, len, cpumask_bits(dstp), - nr_cpu_ids); + nr_cpumask_bits); } /** @@ -586,7 +586,7 @@ static inline int cpumask_parse(const char *buf, struct cpumask *dstp) char *nl = strchr(buf, '\n'); unsigned int len = nl ? (unsigned int)(nl - buf) : strlen(buf); - return bitmap_parse(buf, len, cpumask_bits(dstp), nr_cpu_ids); + return bitmap_parse(buf, len, cpumask_bits(dstp), nr_cpumask_bits); } /** @@ -598,7 +598,7 @@ static inline int cpumask_parse(const char *buf, struct cpumask *dstp) */ static inline int cpulist_parse(const char *buf, struct cpumask *dstp) { - return bitmap_parselist(buf, cpumask_bits(dstp), nr_cpu_ids); + return bitmap_parselist(buf, cpumask_bits(dstp), nr_cpumask_bits); } /** -- cgit v1.2.3