From b3a084b9b684622b149e8dcf03855bf0d5fb588b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 Sep 2010 08:38:44 +0200 Subject: rcu: rcu_read_lock_bh_held(): disabling irqs also disables bh rcu_dereference_bh() doesnt know yet about hard irq being disabled, so lockdep can trigger in netpoll_rx() after commit f0f9deae9e7c4 (netpoll: Disable IRQ around RCU dereference in netpoll_rx) Reported-by: Miles Lane Signed-off-by: Eric Dumazet Tested-by: Miles Lane Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 9fbc54a2585d..83af1f8d8b74 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -454,7 +454,7 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) * Makes rcu_dereference_check() do the dirty work. */ #define rcu_dereference_bh(p) \ - rcu_dereference_check(p, rcu_read_lock_bh_held()) + rcu_dereference_check(p, rcu_read_lock_bh_held() || irqs_disabled()) /** * rcu_dereference_sched - fetch RCU-protected pointer, checking for RCU-sched -- cgit v1.2.3 From 2fc11536cf5c0b8eb4eb7e01a2a672a189e9280f Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Tue, 7 Sep 2010 06:10:45 -0300 Subject: V4L/DVB: videobuf-dma-sg: set correct size in last sg element This fixes a nasty memory corruption bug when using userptr I/O. The function videobuf_pages_to_sg() sets up the scatter-gather list for the DMA transfer to the userspace pages. The first transfer is setup correctly (the size is set to PAGE_SIZE - offset), but all other transfers have size PAGE_SIZE. This is wrong for the last transfer which may be less than PAGE_SIZE. Most, if not all, drivers will program the boards DMA engine correctly, i.e. even though the size in the last sg element is wrong, they will do their own size calculations and make sure the right amount is DMA-ed, and so seemingly prevent memory corruption. However, behind the scenes the dynamic DMA mapping support (in lib/swiotlb.c) may create bounce buffers if the memory pages are not in DMA-able memory. This happens for example on a 64-bit linux with a board that only supports 32-bit DMA. These bounce buffers DO use the information in the sg list to determine the size. So while the DMA engine transfers the correct amount of data, when the data is 'bounced' back too much is copied, causing buffer overwrites. The fix is simple: calculate and set the correct size for the last sg list element. Signed-off-by: Hans Verkuil Cc: stable@kernel.org Signed-off-by: Mauro Carvalho Chehab --- include/media/videobuf-dma-sg.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/media/videobuf-dma-sg.h b/include/media/videobuf-dma-sg.h index 97e07f46a0fa..aa4ebb42a565 100644 --- a/include/media/videobuf-dma-sg.h +++ b/include/media/videobuf-dma-sg.h @@ -48,6 +48,7 @@ struct videobuf_dmabuf { /* for userland buffer */ int offset; + size_t size; struct page **pages; /* for kernel buffers */ -- cgit v1.2.3 From e454c844644683571617896ab2a4ce0109c1943e Mon Sep 17 00:00:00 2001 From: "Gustavo F. Padovan" Date: Tue, 21 Sep 2010 16:31:11 -0300 Subject: Bluetooth: Fix deadlock in the ERTM logic The Enhanced Retransmission Mode(ERTM) is a realiable mode of operation of the Bluetooth L2CAP layer. Think on it like a simplified version of TCP. The problem we were facing here was a deadlock. ERTM uses a backlog queue to queue incomimg packets while the user is helding the lock. At some moment the sk_sndbuf can be exceeded and we can't alloc new skbs then the code sleep with the lock to wait for memory, that stalls the ERTM connection once we can't read the acknowledgements packets in the backlog queue to free memory and make the allocation of outcoming skb successful. This patch actually affect all users of bt_skb_send_alloc(), i.e., all L2CAP modes and SCO. We are safe against socket states changes or channels deletion while the we are sleeping wait memory. Checking for the sk->sk_err and sk->sk_shutdown make the code safe, since any action that can leave the socket or the channel in a not usable state set one of the struct members at least. Then we can check both of them when getting the lock again and return with the proper error if something unexpected happens. Signed-off-by: Gustavo F. Padovan Signed-off-by: Ulisses Furquim --- include/net/bluetooth/bluetooth.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 27a902d9b3a9..30fce0128dd7 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -161,12 +161,30 @@ static inline struct sk_buff *bt_skb_send_alloc(struct sock *sk, unsigned long l { struct sk_buff *skb; + release_sock(sk); if ((skb = sock_alloc_send_skb(sk, len + BT_SKB_RESERVE, nb, err))) { skb_reserve(skb, BT_SKB_RESERVE); bt_cb(skb)->incoming = 0; } + lock_sock(sk); + + if (!skb && *err) + return NULL; + + *err = sock_error(sk); + if (*err) + goto out; + + if (sk->sk_shutdown) { + *err = -ECONNRESET; + goto out; + } return skb; + +out: + kfree_skb(skb); + return NULL; } int bt_err(__u16 code); -- cgit v1.2.3 From 5336377d6225959624146629ce3fc88ee8ecda3d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 5 Oct 2010 11:29:27 -0700 Subject: modules: Fix module_bug_list list corruption race With all the recent module loading cleanups, we've minimized the code that sits under module_mutex, fixing various deadlocks and making it possible to do most of the module loading in parallel. However, that whole conversion totally missed the rather obscure code that adds a new module to the list for BUG() handling. That code was doubly obscure because (a) the code itself lives in lib/bugs.c (for dubious reasons) and (b) it gets called from the architecture-specific "module_finalize()" rather than from generic code. Calling it from arch-specific code makes no sense what-so-ever to begin with, and is now actively wrong since that code isn't protected by the module loading lock any more. So this commit moves the "module_bug_{finalize,cleanup}()" calls away from the arch-specific code, and into the generic code - and in the process protects it with the module_mutex so that the list operations are now safe. Future fixups: - move the module list handling code into kernel/module.c where it belongs. - get rid of 'module_bug_list' and just use the regular list of modules (called 'modules' - imagine that) that we already create and maintain for other reasons. Reported-and-tested-by: Thomas Gleixner Cc: Rusty Russell Cc: Adrian Bunk Cc: Andrew Morton Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- include/linux/module.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/module.h b/include/linux/module.h index 8a6b9fdc7ffa..aace066bad8f 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -686,17 +686,16 @@ extern int module_sysfs_initialized; #ifdef CONFIG_GENERIC_BUG -int module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *, +void module_bug_finalize(const Elf_Ehdr *, const Elf_Shdr *, struct module *); void module_bug_cleanup(struct module *); #else /* !CONFIG_GENERIC_BUG */ -static inline int module_bug_finalize(const Elf_Ehdr *hdr, +static inline void module_bug_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *mod) { - return 0; } static inline void module_bug_cleanup(struct module *mod) {} #endif /* CONFIG_GENERIC_BUG */ -- cgit v1.2.3 From 231d0aefd88e94129cb8fb84794f9bb788c6366e Mon Sep 17 00:00:00 2001 From: Evgeny Kuznetsov Date: Tue, 5 Oct 2010 12:47:57 +0400 Subject: wait: using uninitialized member of wait queue The "flags" member of "struct wait_queue_t" is used in several places in the kernel code without beeing initialized by init_wait(). "flags" is used in bitwise operations. If "flags" not initialized then unexpected behaviour may take place. Incorrect flags might used later in code. Added initialization of "wait_queue_t.flags" with zero value into "init_wait". Signed-off-by: Evgeny Kuznetsov [ The bit we care about does end up being initialized by both prepare_to_wait() and add_to_wait_queue(), so this doesn't seem to cause actual bugs, but is definitely the right thing to do -Linus ] Signed-off-by: Linus Torvalds --- include/linux/wait.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index 0836ccc57121..3efc9f3f43a0 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -614,6 +614,7 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key); (wait)->private = current; \ (wait)->func = autoremove_wake_function; \ INIT_LIST_HEAD(&(wait)->task_list); \ + (wait)->flags = 0; \ } while (0) /** -- cgit v1.2.3 From 1df6a2ebd75067aefbdf07482bf8e3d0584e04ee Mon Sep 17 00:00:00 2001 From: Thomas Hellstrom Date: Thu, 30 Sep 2010 12:36:45 +0200 Subject: drm/ttm: Fix two race conditions + fix busy codepaths This fixes a race pointed out by Dave Airlie where we don't take a buffer object about to be destroyed off the LRU lists properly. It also fixes a rare case where a buffer object could be destroyed in the middle of an accelerated eviction. The patch also adds a utility function that can be used to prematurely release GPU memory space usage of an object waiting to be destroyed. For example during eviction or swapout. The above mentioned commit didn't queue the buffer on the delayed destroy list under some rare circumstances. It also didn't completely honor the remove_all parameter. Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=615505 http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=591061 Signed-off-by: Thomas Hellstrom Signed-off-by: Dave Airlie --- include/drm/ttm/ttm_bo_api.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/drm/ttm/ttm_bo_api.h b/include/drm/ttm/ttm_bo_api.h index 267a86c74e2e..2040e6c4f172 100644 --- a/include/drm/ttm/ttm_bo_api.h +++ b/include/drm/ttm/ttm_bo_api.h @@ -246,9 +246,11 @@ struct ttm_buffer_object { atomic_t reserved; - /** * Members protected by the bo::lock + * In addition, setting sync_obj to anything else + * than NULL requires bo::reserved to be held. This allows for + * checking NULL while reserved but not holding bo::lock. */ void *sync_obj_arg; -- cgit v1.2.3 From 430c62fb2948d964cf8dc7f3e2f69623c04ef62f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 7 Oct 2010 09:35:16 +0200 Subject: elevator: fix oops on early call to elevator_change() 2.6.36 introduces an API for drivers to switch the IO scheduler instead of manually calling the elevator exit and init functions. This API was added since q->elevator must be cleared in between those two calls. And since we already have this functionality directly from use by the sysfs interface to switch schedulers online, it was prudent to reuse it internally too. But this API needs the queue to be in a fully initialized state before it is called, or it will attempt to unregister elevator kobjects before they have been added. This results in an oops like this: BUG: unable to handle kernel NULL pointer dereference at 0000000000000051 IP: [] sysfs_create_dir+0x2e/0xc0 PGD 47ddfc067 PUD 47c6a1067 PMD 0 Oops: 0000 [#1] PREEMPT SMP last sysfs file: /sys/devices/pci0000:00/0000:00:02.0/0000:04:00.1/irq CPU 2 Modules linked in: t(+) loop hid_apple usbhid ahci ehci_hcd uhci_hcd libahci usbcore nls_base igb Pid: 7319, comm: modprobe Not tainted 2.6.36-rc6+ #132 QSSC-S4R/QSSC-S4R RIP: 0010:[] [] sysfs_create_dir+0x2e/0xc0 RSP: 0018:ffff88027da25d08 EFLAGS: 00010246 RAX: ffff88047c68c528 RBX: 00000000fffffffe RCX: 0000000000000000 RDX: 000000000000002f RSI: 000000000000002f RDI: ffff88047e196c88 RBP: ffff88027da25d38 R08: 0000000000000000 R09: d84156c5635688c0 R10: d84156c5635688c0 R11: 0000000000000000 R12: ffff88047e196c88 R13: 0000000000000000 R14: 0000000000000000 R15: ffff88047c68c528 FS: 00007fcb0b26f6e0(0000) GS:ffff880287400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000051 CR3: 000000047e76e000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process modprobe (pid: 7319, threadinfo ffff88027da24000, task ffff88027d377090) Stack: ffff88027da25d58 ffff88047c68c528 00000000fffffffe ffff88047e196c88 <0> ffff88047c68c528 ffff88047e05bd90 ffff88027da25d78 ffffffff8123fb77 <0> ffff88047e05bd90 0000000000000000 ffff88047e196c88 ffff88047c68c528 Call Trace: [] kobject_add_internal+0xe7/0x1f0 [] kobject_add_varg+0x38/0x60 [] kobject_add+0x69/0x90 [] ? sysfs_remove_dir+0x20/0xa0 [] ? sub_preempt_count+0x9d/0xe0 [] ? _raw_spin_unlock+0x30/0x50 [] ? sysfs_remove_dir+0x20/0xa0 [] ? sysfs_remove_dir+0x34/0xa0 [] elv_register_queue+0x34/0xa0 [] elevator_change+0xfd/0x250 [] ? t_init+0x0/0x361 [t] [] ? t_init+0x0/0x361 [t] [] t_init+0xa8/0x361 [t] [] do_one_initcall+0x3e/0x170 [] sys_init_module+0xbd/0x220 [] system_call_fastpath+0x16/0x1b Code: e5 41 56 41 55 41 54 49 89 fc 53 48 83 ec 10 48 85 ff 74 52 48 8b 47 18 49 c7 c5 00 46 61 81 48 85 c0 74 04 4c 8b 68 30 45 31 f6 <41> 80 7d 51 00 74 0e 49 8b 44 24 28 4c 89 e7 ff 50 20 49 89 c6 RIP [] sysfs_create_dir+0x2e/0xc0 RSP CR2: 0000000000000051 ---[ end trace a6541d3bf07945df ]--- Fix this by adding a registered bit to the elevator queue, which is set when the sysfs kobjects have been registered. Signed-off-by: Jens Axboe --- include/linux/elevator.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 926b50322a46..4fd978e7eb83 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -93,6 +93,7 @@ struct elevator_queue struct elevator_type *elevator_type; struct mutex sysfs_lock; struct hlist_head *hash; + unsigned int registered:1; }; /* -- cgit v1.2.3 From 7c5347733dcc4ba0bac0baf86d99fae0561f33b7 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 11 Oct 2010 18:13:31 -0400 Subject: fanotify: disable fanotify syscalls This patch disables the fanotify syscalls by just not building them and letting the cond_syscall() statements in kernel/sys_ni.c redirect them to sys_ni_syscall(). It was pointed out by Tvrtko Ursulin that the fanotify interface did not include an explicit prioritization between groups. This is necessary for fanotify to be usable for hierarchical storage management software, as they must get first access to the file, before inotify-like notifiers see the file. This feature can be added in an ABI compatible way in the next release (by using a number of bits in the flags field to carry the info) but it was suggested by Alan that maybe we should just hold off and do it in the next cycle, likely with an (new) explicit argument to the syscall. I don't like this approach best as I know people are already starting to use the current interface, but Alan is all wise and noone on list backed me up with just using what we have. I feel this is needlessly ripping the rug out from under people at the last minute, but if others think it needs to be a new argument it might be the best way forward. Three choices: Go with what we got (and implement the new feature next cycle). Add a new field right now (and implement the new feature next cycle). Wait till next cycle to release the ABI (and implement the new feature next cycle). This is number 3. Signed-off-by: Eric Paris Signed-off-by: Linus Torvalds --- include/linux/Kbuild | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/Kbuild b/include/linux/Kbuild index 626b629429ff..4e8ea8c8ec1e 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -118,7 +118,6 @@ header-y += eventpoll.h header-y += ext2_fs.h header-y += fadvise.h header-y += falloc.h -header-y += fanotify.h header-y += fb.h header-y += fcntl.h header-y += fd.h -- cgit v1.2.3 From 0eead9ab41da33644ae2c97c57ad03da636a0422 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 14 Oct 2010 10:57:40 -0700 Subject: Don't dump task struct in a.out core-dumps akiphie points out that a.out core-dumps have that odd task struct dumping that was never used and was never really a good idea (it goes back into the mists of history, probably the original core-dumping code). Just remove it. Also do the access_ok() check on dump_write(). It probably doesn't matter (since normal filesystems all seem to do it anyway), but he points out that it's normally done by the VFS layer, so ... [ I suspect that we should possibly do "vfs_write()" instead of calling ->write directly. That also does the whole fsnotify and write statistics thing, which may or may not be a good idea. ] And just to be anal, do this all for the x86-64 32-bit a.out emulation code too, even though it's not enabled (and won't currently even compile) Reported-by: akiphie Signed-off-by: Linus Torvalds --- include/linux/coredump.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/coredump.h b/include/linux/coredump.h index 8ba66a9d9022..59579cfee6a0 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -11,7 +11,7 @@ */ static inline int dump_write(struct file *file, const void *addr, int nr) { - return file->f_op->write(file, addr, nr, &file->f_pos) == nr; + return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr; } static inline int dump_seek(struct file *file, loff_t off) -- cgit v1.2.3 From 3aa0ce825ade0cf5506e32ccf51d01fc8d22a9cf Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 14 Oct 2010 14:32:06 -0700 Subject: Un-inline the core-dump helper functions Tony Luck reports that the addition of the access_ok() check in commit 0eead9ab41da ("Don't dump task struct in a.out core-dumps") broke the ia64 compile due to missing the necessary header file includes. Rather than add yet another include () to make everything happy, just uninline the silly core dump helper functions and move the bodies to fs/exec.c where they make a lot more sense. dump_seek() in particular was too big to be an inline function anyway, and none of them are in any way performance-critical. And we really don't need to mess up our include file headers more than they already are. Reported-and-tested-by: Tony Luck Signed-off-by: Linus Torvalds --- include/linux/coredump.h | 34 ++-------------------------------- 1 file changed, 2 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/linux/coredump.h b/include/linux/coredump.h index 59579cfee6a0..ba4b85a6d9b8 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -9,37 +9,7 @@ * These are the only things you should do on a core-file: use only these * functions to write out all the necessary info. */ -static inline int dump_write(struct file *file, const void *addr, int nr) -{ - return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr; -} - -static inline int dump_seek(struct file *file, loff_t off) -{ - int ret = 1; - - if (file->f_op->llseek && file->f_op->llseek != no_llseek) { - if (file->f_op->llseek(file, off, SEEK_CUR) < 0) - return 0; - } else { - char *buf = (char *)get_zeroed_page(GFP_KERNEL); - - if (!buf) - return 0; - while (off > 0) { - unsigned long n = off; - - if (n > PAGE_SIZE) - n = PAGE_SIZE; - if (!dump_write(file, buf, n)) { - ret = 0; - break; - } - off -= n; - } - free_page((unsigned long)buf); - } - return ret; -} +extern int dump_write(struct file *file, const void *addr, int nr); +extern int dump_seek(struct file *file, loff_t off); #endif /* _LINUX_COREDUMP_H */ -- cgit v1.2.3 From 79b5dc0c64d88cda3da23b2e22a5cec0964372ac Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 15 Oct 2010 14:34:14 -0700 Subject: types.h: define __aligned_u64 and expose to userspace We currently have a kernel internal type called aligned_u64 which aligns __u64's on 8 bytes boundaries even on systems which would normally align them on 4 byte boundaries. This patch creates a new type __aligned_u64 which does the same thing but which is exposed to userspace rather than being kernel internal. [akpm: merge early as both the net and audit trees want this] [akpm@linux-foundation.org: enhance the comment describing the reasons for using aligned_u64. Via Andreas and Andi.] Based-on-patch-by: Andreas Gruenbacher Signed-off-by: Eric Paris Cc: Jan Engelhardt Cc: David Miller Cc: Andi Kleen Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/types.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/types.h b/include/linux/types.h index 01a082f56ef4..357dbc19606f 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -121,7 +121,15 @@ typedef __u64 u_int64_t; typedef __s64 int64_t; #endif -/* this is a special 64bit data type that is 8-byte aligned */ +/* + * aligned_u64 should be used in defining kernel<->userspace ABIs to avoid + * common 32/64-bit compat problems. + * 64-bit values align to 4-byte boundaries on x86_32 (and possibly other + * architectures) and to 8-byte boundaries on 64-bit architetures. The new + * aligned_64 type enforces 8-byte alignment so that structs containing + * aligned_64 values have the same alignment on 32-bit and 64-bit architectures. + * No conversions are necessary between 32-bit user-space and a 64-bit kernel. + */ #define aligned_u64 __u64 __attribute__((aligned(8))) #define aligned_be64 __be64 __attribute__((aligned(8))) #define aligned_le64 __le64 __attribute__((aligned(8))) @@ -178,6 +186,11 @@ typedef __u64 __bitwise __be64; typedef __u16 __bitwise __sum16; typedef __u32 __bitwise __wsum; +/* this is a special 64bit data type that is 8-byte aligned */ +#define __aligned_u64 __u64 __attribute__((aligned(8))) +#define __aligned_be64 __be64 __attribute__((aligned(8))) +#define __aligned_le64 __le64 __attribute__((aligned(8))) + #ifdef __KERNEL__ typedef unsigned __bitwise__ gfp_t; typedef unsigned __bitwise__ fmode_t; -- cgit v1.2.3