From e30e2fdfe56288576ee9e04dbb06b4bd5f282203 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Thu, 22 Dec 2011 02:45:29 +0530 Subject: VFS: Fix race between CPU hotplug and lglocks Currently, the *_global_[un]lock_online() routines are not at all synchronized with CPU hotplug. Soft-lockups detected as a consequence of this race was reported earlier at https://lkml.org/lkml/2011/8/24/185. (Thanks to Cong Meng for finding out that the root-cause of this issue is the race condition between br_write_[un]lock() and CPU hotplug, which results in the lock states getting messed up). Fixing this race by just adding {get,put}_online_cpus() at appropriate places in *_global_[un]lock_online() is not a good option, because, then suddenly br_write_[un]lock() would become blocking, whereas they have been kept as non-blocking all this time, and we would want to keep them that way. So, overall, we want to ensure 3 things: 1. br_write_lock() and br_write_unlock() must remain as non-blocking. 2. The corresponding lock and unlock of the per-cpu spinlocks must not happen for different sets of CPUs. 3. Either prevent any new CPU online operation in between this lock-unlock, or ensure that the newly onlined CPU does not proceed with its corresponding per-cpu spinlock unlocked. To achieve all this: (a) We introduce a new spinlock that is taken by the *_global_lock_online() routine and released by the *_global_unlock_online() routine. (b) We register a callback for CPU hotplug notifications, and this callback takes the same spinlock as above. (c) We maintain a bitmap which is close to the cpu_online_mask, and once it is initialized in the lock_init() code, all future updates to it are done in the callback, under the above spinlock. (d) The above bitmap is used (instead of cpu_online_mask) while locking and unlocking the per-cpu locks. The callback takes the spinlock upon the CPU_UP_PREPARE event. So, if the br_write_lock-unlock sequence is in progress, the callback keeps spinning, thus preventing the CPU online operation till the lock-unlock sequence is complete. This takes care of requirement (3). The bitmap that we maintain remains unmodified throughout the lock-unlock sequence, since all updates to it are managed by the callback, which takes the same spinlock as the one taken by the lock code and released only by the unlock routine. Combining this with (d) above, satisfies requirement (2). Overall, since we use a spinlock (mentioned in (a)) to prevent CPU hotplug operations from racing with br_write_lock-unlock, requirement (1) is also taken care of. By the way, it is to be noted that a CPU offline operation can actually run in parallel with our lock-unlock sequence, because our callback doesn't react to notifications earlier than CPU_DEAD (in order to maintain our bitmap properly). And this means, since we use our own bitmap (which is stale, on purpose) during the lock-unlock sequence, we could end up unlocking the per-cpu lock of an offline CPU (because we had locked it earlier, when the CPU was online), in order to satisfy requirement (2). But this is harmless, though it looks a bit awkward. Debugged-by: Cong Meng Signed-off-by: Srivatsa S. Bhat Signed-off-by: Al Viro Cc: stable@vger.kernel.org --- include/linux/lglock.h | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lglock.h b/include/linux/lglock.h index f549056fb20b..87f402ccec55 100644 --- a/include/linux/lglock.h +++ b/include/linux/lglock.h @@ -22,6 +22,7 @@ #include #include #include +#include /* can make br locks by using local lock for read side, global lock for write */ #define br_lock_init(name) name##_lock_init() @@ -72,9 +73,31 @@ #define DEFINE_LGLOCK(name) \ \ + DEFINE_SPINLOCK(name##_cpu_lock); \ + cpumask_t name##_cpus __read_mostly; \ DEFINE_PER_CPU(arch_spinlock_t, name##_lock); \ DEFINE_LGLOCK_LOCKDEP(name); \ \ + static int \ + name##_lg_cpu_callback(struct notifier_block *nb, \ + unsigned long action, void *hcpu) \ + { \ + switch (action & ~CPU_TASKS_FROZEN) { \ + case CPU_UP_PREPARE: \ + spin_lock(&name##_cpu_lock); \ + cpu_set((unsigned long)hcpu, name##_cpus); \ + spin_unlock(&name##_cpu_lock); \ + break; \ + case CPU_UP_CANCELED: case CPU_DEAD: \ + spin_lock(&name##_cpu_lock); \ + cpu_clear((unsigned long)hcpu, name##_cpus); \ + spin_unlock(&name##_cpu_lock); \ + } \ + return NOTIFY_OK; \ + } \ + static struct notifier_block name##_lg_cpu_notifier = { \ + .notifier_call = name##_lg_cpu_callback, \ + }; \ void name##_lock_init(void) { \ int i; \ LOCKDEP_INIT_MAP(&name##_lock_dep_map, #name, &name##_lock_key, 0); \ @@ -83,6 +106,11 @@ lock = &per_cpu(name##_lock, i); \ *lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; \ } \ + register_hotcpu_notifier(&name##_lg_cpu_notifier); \ + get_online_cpus(); \ + for_each_online_cpu(i) \ + cpu_set(i, name##_cpus); \ + put_online_cpus(); \ } \ EXPORT_SYMBOL(name##_lock_init); \ \ @@ -124,9 +152,9 @@ \ void name##_global_lock_online(void) { \ int i; \ - preempt_disable(); \ + spin_lock(&name##_cpu_lock); \ rwlock_acquire(&name##_lock_dep_map, 0, 0, _RET_IP_); \ - for_each_online_cpu(i) { \ + for_each_cpu(i, &name##_cpus) { \ arch_spinlock_t *lock; \ lock = &per_cpu(name##_lock, i); \ arch_spin_lock(lock); \ @@ -137,12 +165,12 @@ void name##_global_unlock_online(void) { \ int i; \ rwlock_release(&name##_lock_dep_map, 1, _RET_IP_); \ - for_each_online_cpu(i) { \ + for_each_cpu(i, &name##_cpus) { \ arch_spinlock_t *lock; \ lock = &per_cpu(name##_lock, i); \ arch_spin_unlock(lock); \ } \ - preempt_enable(); \ + spin_unlock(&name##_cpu_lock); \ } \ EXPORT_SYMBOL(name##_global_unlock_online); \ \ -- cgit v1.2.3 From 4d25a066b69fb749a39d0d4c610689dd765a0b0e Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 21 Dec 2011 12:28:29 +0100 Subject: KVM: Don't automatically expose the TSC deadline timer in cpuid Unlike all of the other cpuid bits, the TSC deadline timer bit is set unconditionally, regardless of what userspace wants. This is broken in several ways: - if userspace doesn't use KVM_CREATE_IRQCHIP, and doesn't emulate the TSC deadline timer feature, a guest that uses the feature will break - live migration to older host kernels that don't support the TSC deadline timer will cause the feature to be pulled from under the guest's feet; breaking it - guests that are broken wrt the feature will fail. Fix by not enabling the feature automatically; instead report it to userspace. Because the feature depends on KVM_CREATE_IRQCHIP, which we cannot guarantee will be called, we expose it via a KVM_CAP_TSC_DEADLINE_TIMER and not KVM_GET_SUPPORTED_CPUID. Fixes the Illumos guest kernel, which uses the TSC deadline timer feature. [avi: add the KVM_CAP + documentation] Reported-by: Alexey Zaytsev Tested-by: Alexey Zaytsev Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- include/linux/kvm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kvm.h b/include/linux/kvm.h index c3892fc1d538..68e67e50d028 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -557,6 +557,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_MAX_VCPUS 66 /* returns max vcpus per vm */ #define KVM_CAP_PPC_PAPR 68 #define KVM_CAP_S390_GMAP 71 +#define KVM_CAP_TSC_DEADLINE_TIMER 72 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 30e053248da178cf6154bb7e950dc8713567e3fa Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 3 Jan 2012 13:14:29 +0100 Subject: security: Fix security_old_inode_init_security() when CONFIG_SECURITY is not set Commit 1e39f384bb01 ("evm: fix build problems") makes the stub version of security_old_inode_init_security() return 0 when CONFIG_SECURITY is not set. But that makes callers such as reiserfs_security_init() assume that security_old_inode_init_security() has set name, value, and len arguments properly - but security_old_inode_init_security() left them uninitialized which then results in interesting failures. Revert security_old_inode_init_security() to the old behavior of returning EOPNOTSUPP since both callers (reiserfs and ocfs2) handle this just fine. [ Also fixed the S_PRIVATE(inode) case of the actual non-stub security_old_inode_init_security() function to return EOPNOTSUPP for the same reason, as pointed out by Mimi Zohar. It got incorrectly changed to match the new function in commit fb88c2b6cbb1: "evm: fix security/security_old_init_security return code". - Linus ] Reported-by: Jorge Bastos Acked-by: James Morris Acked-by: Mimi Zohar Signed-off-by: Jan Kara Signed-off-by: Linus Torvalds --- include/linux/security.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 19d8e04e1688..e8c619d39291 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2056,7 +2056,7 @@ static inline int security_old_inode_init_security(struct inode *inode, char **name, void **value, size_t *len) { - return 0; + return -EOPNOTSUPP; } static inline int security_inode_create(struct inode *dir, -- cgit v1.2.3