From f6d87f4bd259cf33e092cd1a8fde05f291c47af1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 7 Nov 2008 13:18:30 +0100 Subject: genirq: keep affinities set from userspace across free/request_irq() Impact: preserve user-modified affinities on interrupts Kumar Galak noticed that commit 18404756765c713a0be4eb1082920c04822ce588 (genirq: Expose default irq affinity mask (take 3)) overrides an already set affinity setting across a free / request_irq(). Happens e.g. with ifdown/ifup of a network device. Change the logic to mark the affinities as set and keep them intact. This also fixes the unlocked access to irq_desc in irq_select_affinity() when called from irq_affinity_proc_write() Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/irq.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index d058c57be02d..36b186eb318b 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -63,7 +63,8 @@ typedef void (*irq_flow_handler_t)(unsigned int irq, #define IRQ_MOVE_PENDING 0x00200000 /* need to re-target IRQ destination */ #define IRQ_NO_BALANCING 0x00400000 /* IRQ is excluded from balancing */ #define IRQ_SPURIOUS_DISABLED 0x00800000 /* IRQ was disabled by the spurious trap */ -#define IRQ_MOVE_PCNTXT 0x01000000 /* IRQ migration from process context */ +#define IRQ_MOVE_PCNTXT 0x01000000 /* IRQ migration from process context */ +#define IRQ_AFFINITY_SET 0x02000000 /* IRQ affinity was set from userspace*/ #ifdef CONFIG_IRQ_PER_CPU # define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU) @@ -210,7 +211,6 @@ extern int setup_irq(unsigned int irq, struct irqaction *new); #ifdef CONFIG_GENERIC_PENDING_IRQ -void set_pending_irq(unsigned int irq, cpumask_t mask); void move_native_irq(int irq); void move_masked_irq(int irq); @@ -228,10 +228,6 @@ static inline void move_masked_irq(int irq) { } -static inline void set_pending_irq(unsigned int irq, cpumask_t mask) -{ -} - #endif /* CONFIG_GENERIC_PENDING_IRQ */ #else /* CONFIG_SMP */ -- cgit v1.2.3 From 2ed1cdcf9a83205d1343f29b630abff232eaa72c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 21 Nov 2008 16:59:57 -0800 Subject: irq.h: fix missing/extra kernel-doc Impact: fix kernel-doc build Fix missing & excess irq.h kernel-doc: Warning(include/linux/irq.h:182): No description found for parameter 'irq' Warning(include/linux/irq.h:182): Excess struct/union/enum/typedef member 'affinity_entry' description in 'irq_desc' Signed-off-by: Randy Dunlap Cc: Andrew Morton Signed-off-by: Ingo Molnar --- include/linux/irq.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 36b186eb318b..3dddfa703ebd 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -131,7 +131,7 @@ struct irq_chip { /** * struct irq_desc - interrupt descriptor - * + * @irq: interrupt number for this descriptor * @handle_irq: highlevel irq-events handler [if NULL, __do_IRQ()] * @chip: low level interrupt hardware access * @msi_desc: MSI descriptor @@ -150,7 +150,6 @@ struct irq_chip { * @cpu: cpu index useful for balancing * @pending_mask: pending rebalanced interrupts * @dir: /proc/irq/ procfs entry - * @affinity_entry: /proc/irq/smp_affinity procfs entry on SMP * @name: flow handler name for /proc/interrupts output */ struct irq_desc { -- cgit v1.2.3 From f79fca55f9a6fe54635ad32ddc8a38f92a94ec30 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Mon, 24 Nov 2008 16:06:17 -0800 Subject: netfilter: xtables: add missing const qualifier to xt_tgchk_param When entryinfo was a standalone parameter to functions, it used to be "const void *". Put the const back in. Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/netfilter/x_tables.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index be41b609c88f..e52ce475d19f 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -251,7 +251,7 @@ struct xt_target_param { */ struct xt_tgchk_param { const char *table; - void *entryinfo; + const void *entryinfo; const struct xt_target *target; void *targinfo; unsigned int hook_mask; -- cgit v1.2.3 From 487ff32082a9bd7489d8185cf7d7a2fdf18a22fa Mon Sep 17 00:00:00 2001 From: Russell King Date: Thu, 27 Nov 2008 11:13:58 +0000 Subject: Allow architectures to override copy_user_highpage() With aliasing VIPT cache support, the ARM implementation of clear_user_page() and copy_user_page() sets up a temporary kernel space mapping such that we have the same cache colour as the userspace page. This avoids having to consider any userspace aliases from this operation. However, when highmem is enabled, kmap_atomic() have to setup mappings. The copy_user_highpage() and clear_user_highpage() call these functions before delegating the copies to copy_user_page() and clear_user_page(). The effect of this is that each of the *_user_highpage() functions setup their own kmap mapping, followed by the *_user_page() functions setting up another mapping. This is rather wasteful. Thankfully, copy_user_highpage() can be overriden by architectures by defining __HAVE_ARCH_COPY_USER_HIGHPAGE. However, replacement of clear_user_highpage() is more difficult because its inline definition is not conditional. It seems that you're expected to define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE and provide a replacement __alloc_zeroed_user_highpage() implementation instead. The allocation itself is fine, so we don't want to override that. What we really want to do is to override clear_user_highpage() with our own version which doesn't kmap_atomic() unnecessarily. Other VIPT architectures (PARISC and SH) would also like to override this function as well. Acked-by: Hugh Dickins Acked-by: James Bottomley Acked-by: Paul Mundt Signed-off-by: Russell King --- include/linux/highmem.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 7dcbc82f3b7b..13875ce9112a 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -63,12 +63,14 @@ static inline void *kmap_atomic(struct page *page, enum km_type idx) #endif /* CONFIG_HIGHMEM */ /* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ +#ifndef clear_user_highpage static inline void clear_user_highpage(struct page *page, unsigned long vaddr) { void *addr = kmap_atomic(page, KM_USER0); clear_user_page(addr, vaddr, page); kunmap_atomic(addr, KM_USER0); } +#endif #ifndef __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE /** -- cgit v1.2.3 From 9a5aa622dd4cd22b5e0fe83e4a9c0c768d4e2dea Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Fri, 28 Nov 2008 21:29:46 -0800 Subject: mlx4_core: Save/restore default port IB capability mask Commit 7ff93f8b ("mlx4_core: Multiple port type support") introduced support for different port types. As part of that support, SET_PORT is invoked to set the port type during driver startup. However, as a side-effect, for IB ports the invocation of this command also sets the port's capability mask to zero (losing the default value set by FW). To fix this, get the default ib port capabilities (via a MAD_IFC Port Info query) during driver startup, and save them for use in the mlx4_SET_PORT command when setting the port-type to Infiniband. This patch fixes problems with subnet manager (SM) failover such as , which occurred because the IsTrapSupported bit in the capability mask was zeroed. Signed-off-by: Jack Morgenstein Signed-off-by: Roland Dreier --- include/linux/mlx4/device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index bd9977b89490..371086fd946f 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -179,6 +179,7 @@ struct mlx4_caps { int num_ports; int vl_cap[MLX4_MAX_PORTS + 1]; int ib_mtu_cap[MLX4_MAX_PORTS + 1]; + __be32 ib_port_def_cap[MLX4_MAX_PORTS + 1]; u64 def_mac[MLX4_MAX_PORTS + 1]; int eth_mtu_cap[MLX4_MAX_PORTS + 1]; int gid_table_len[MLX4_MAX_PORTS + 1]; -- cgit v1.2.3 From 31168481c32c8a485e1003af9433124dede57f8d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 22 Nov 2008 17:33:24 +0000 Subject: meminit section warnings Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- include/linux/page_cgroup.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index f546ad6fc028..1e6d34bfa094 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -17,7 +17,7 @@ struct page_cgroup { struct list_head lru; /* per cgroup LRU list */ }; -void __init pgdat_page_cgroup_init(struct pglist_data *pgdat); +void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat); void __init page_cgroup_init(void); struct page_cgroup *lookup_page_cgroup(struct page *page); @@ -91,7 +91,7 @@ static inline void unlock_page_cgroup(struct page_cgroup *pc) #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct page_cgroup; -static inline void pgdat_page_cgroup_init(struct pglist_data *pgdat) +static inline void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) { } -- cgit v1.2.3 From 02d0e6753d8ab0173b63338157929e52eac86d12 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 22 Nov 2008 17:38:34 +0000 Subject: hotplug_memory_notifier section annotation Same as for hotplug_cpu - we want static notifier_block in there in meminitdata, to avoid false positives whenever it's used. Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- include/linux/memory.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/memory.h b/include/linux/memory.h index 2f5f8a5ef2a0..36c82c9e6ea7 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -91,7 +91,7 @@ extern int memory_notify(unsigned long val, void *v); #ifdef CONFIG_MEMORY_HOTPLUG #define hotplug_memory_notifier(fn, pri) { \ - static struct notifier_block fn##_mem_nb = \ + static __meminitdata struct notifier_block fn##_mem_nb =\ { .notifier_call = fn, .priority = pri }; \ register_memory_notifier(&fn##_mem_nb); \ } -- cgit v1.2.3 From 96b8936a9ed08746e47081458a5eb9e43a751e24 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 25 Nov 2008 08:10:03 +0100 Subject: remove __ARCH_WANT_COMPAT_SYS_PTRACE All architectures now use the generic compat_sys_ptrace, as should every new architecture that needs 32bit compat (if we'll ever get another). Remove the now superflous __ARCH_WANT_COMPAT_SYS_PTRACE define, and also kill a comment about __ARCH_SYS_PTRACE that was added after __ARCH_SYS_PTRACE was already gone. Signed-off-by: Christoph Hellwig Acked-by: David S. Miller Signed-off-by: Linus Torvalds --- include/linux/compat.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index f061a1ea1b74..e88f3ecf38b4 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -252,12 +252,10 @@ extern int compat_ptrace_request(struct task_struct *child, compat_long_t request, compat_ulong_t addr, compat_ulong_t data); -#ifdef __ARCH_WANT_COMPAT_SYS_PTRACE extern long compat_arch_ptrace(struct task_struct *child, compat_long_t request, compat_ulong_t addr, compat_ulong_t data); asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, compat_long_t addr, compat_long_t data); -#endif /* __ARCH_WANT_COMPAT_SYS_PTRACE */ /* * epoll (fs/eventpoll.c) compat bits follow ... -- cgit v1.2.3 From ac70a964b0e22a95af3628c344815857a01461b7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 27 Nov 2008 13:36:48 +0900 Subject: libata: blacklist Seagate drives which time out FLUSH_CACHE when used with NCQ Some recent Seagate harddrives have firmware bug which causes FLUSH CACHE to timeout under certain circumstances if NCQ is being used. This can be worked around by disabling NCQ and fixed by updating the firmware. Implement ATA_HORKAGE_FIRMWARE_UPDATE and blacklist these devices. The wiki page has been updated to contain information on this issue. http://ata.wiki.kernel.org/index.php/Known_issues Signed-off-by: Tejun Heo Signed-off-by: Jeff Garzik --- include/linux/libata.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 59b0f1c807b5..ed3f26eb5df1 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -375,6 +375,7 @@ enum { ATA_HORKAGE_BRIDGE_OK = (1 << 10), /* no bridge limits */ ATA_HORKAGE_ATAPI_MOD16_DMA = (1 << 11), /* use ATAPI DMA for commands not multiple of 16 bytes */ + ATA_HORKAGE_FIRMWARE_WARN = (1 << 12), /* firwmare update warning */ /* DMA mask for user DMA control: User visible values; DO NOT renumber */ -- cgit v1.2.3 From 7ef9964e6d1b911b78709f144000aacadd0ebc21 Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Mon, 1 Dec 2008 13:13:55 -0800 Subject: epoll: introduce resource usage limits It has been thought that the per-user file descriptors limit would also limit the resources that a normal user can request via the epoll interface. Vegard Nossum reported a very simple program (a modified version attached) that can make a normal user to request a pretty large amount of kernel memory, well within the its maximum number of fds. To solve such problem, default limits are now imposed, and /proc based configuration has been introduced. A new directory has been created, named /proc/sys/fs/epoll/ and inside there, there are two configuration points: max_user_instances = Maximum number of devices - per user max_user_watches = Maximum number of "watched" fds - per user The current default for "max_user_watches" limits the memory used by epoll to store "watches", to 1/32 of the amount of the low RAM. As example, a 256MB 32bit machine, will have "max_user_watches" set to roughly 90000. That should be enough to not break existing heavy epoll users. The default value for "max_user_instances" is set to 128, that should be enough too. This also changes the userspace, because a new error code can now come out from EPOLL_CTL_ADD (-ENOSPC). The EMFILE from epoll_create() was already listed, so that should be ok. [akpm@linux-foundation.org: use get_current_user()] Signed-off-by: Davide Libenzi Cc: Michael Kerrisk Cc: Cc: Cyrill Gorcunov Reported-by: Vegard Nossum Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 644ffbda17ca..55e30d114477 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -630,6 +630,10 @@ struct user_struct { atomic_t inotify_watches; /* How many inotify watches does this user have? */ atomic_t inotify_devs; /* How many inotify devs does this user have opened? */ #endif +#ifdef CONFIG_EPOLL + atomic_t epoll_devs; /* The number of epoll descriptors currently open */ + atomic_t epoll_watches; /* The number of file descriptors currently watched */ +#endif #ifdef CONFIG_POSIX_MQUEUE /* protected by mq_lock */ unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */ -- cgit v1.2.3 From 6ff2d39b91aec3dcae951afa982059e3dd9b49dc Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Mon, 1 Dec 2008 13:14:02 -0800 Subject: lib/idr.c: fix rcu related race with idr_find 2nd part of the fixes needed for http://bugzilla.kernel.org/show_bug.cgi?id=11796. When the idr tree is either grown or shrunk, then the update to the number of layers and the top pointer were not atomic. This race caused crashes. The attached patch fixes that by replicating the layers counter in each layer, thus idr_find doesn't need idp->layers anymore. Signed-off-by: Manfred Spraul Cc: Clement Calmels Cc: Nadia Derbey Cc: Pierre Peiffer Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/idr.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/idr.h b/include/linux/idr.h index fa035f96f2a3..dd846df8cd32 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -52,13 +52,14 @@ struct idr_layer { unsigned long bitmap; /* A zero bit means "space here" */ struct idr_layer *ary[1< Date: Tue, 2 Dec 2008 20:40:03 +0100 Subject: amd74xx: workaround unreliable AltStatus register for nVidia controllers It seems that on some nVidia controllers using AltStatus register can be unreliable so default to Status register if the PCI device is in Compatibility Mode. In order to achieve this: * Add ide_pci_is_in_compatibility_mode() inline helper to . * Add IDE_HFLAG_BROKEN_ALTSTATUS host flag and set it in amd74xx host driver for nVidia controllers in Compatibility Mode. * Teach actual_try_to_identify() and drive_is_ready() about the new flag. This fixes the regression caused by removal of CONFIG_IDEPCI_SHARE_IRQ config option in 2.6.25 and using AltStatus register unconditionally when available (kernel.org bugs #11659 and #10216). [ Moreover for CONFIG_IDEPCI_SHARE_IRQ=y (which is what most people and distributions use) it never worked correctly. ] Thanks to Remy LABENE and Lars Winterfeld for help with debugging the problem. More info at: http://bugzilla.kernel.org/show_bug.cgi?id=11659 http://bugzilla.kernel.org/show_bug.cgi?id=10216 Reported-by: Remy LABENE Tested-by: Remy LABENE Tested-by: Lars Winterfeld Acked-by: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index 54525be4b5f8..010fb26a1579 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1296,6 +1296,13 @@ extern int __ide_pci_register_driver(struct pci_driver *driver, struct module *o #define ide_pci_register_driver(d) pci_register_driver(d) #endif +static inline int ide_pci_is_in_compatibility_mode(struct pci_dev *dev) +{ + if ((dev->class >> 8) == PCI_CLASS_STORAGE_IDE && (dev->class & 5) != 5) + return 1; + return 0; +} + void ide_pci_setup_ports(struct pci_dev *, const struct ide_port_info *, int, hw_regs_t *, hw_regs_t **); void ide_setup_pci_noise(struct pci_dev *, const struct ide_port_info *); @@ -1375,6 +1382,7 @@ enum { IDE_HFLAG_IO_32BIT = (1 << 24), /* unmask IRQs */ IDE_HFLAG_UNMASK_IRQS = (1 << 25), + IDE_HFLAG_BROKEN_ALTSTATUS = (1 << 26), /* serialize ports if DMA is possible (for sl82c105) */ IDE_HFLAG_SERIALIZE_DMA = (1 << 27), /* force host out of "simplex" mode */ -- cgit v1.2.3 From 1b79cd04fab80be61dcd2732e2423aafde9a4c1c Mon Sep 17 00:00:00 2001 From: "Junjiro R. Okajima" Date: Tue, 2 Dec 2008 10:31:46 -0800 Subject: nfsd: fix vm overcommit crash fix #2 The previous patch from Alan Cox ("nfsd: fix vm overcommit crash", commit 731572d39fcd3498702eda4600db4c43d51e0b26) fixed the problem where knfsd crashes on exported shmemfs objects and strict overcommit is set. But the patch forgot supporting the case when CONFIG_SECURITY is disabled. This patch copies a part of his fix which is mainly for detecting a bug earlier. Acked-by: James Morris Signed-off-by: Alan Cox Signed-off-by: Junjiro R. Okajima Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/security.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index c13f1cec9abb..e3d4ecda2673 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1818,17 +1818,21 @@ static inline int security_settime(struct timespec *ts, struct timezone *tz) static inline int security_vm_enough_memory(long pages) { + WARN_ON(current->mm == NULL); return cap_vm_enough_memory(current->mm, pages); } -static inline int security_vm_enough_memory_kern(long pages) +static inline int security_vm_enough_memory_mm(struct mm_struct *mm, long pages) { - return cap_vm_enough_memory(current->mm, pages); + WARN_ON(mm == NULL); + return cap_vm_enough_memory(mm, pages); } -static inline int security_vm_enough_memory_mm(struct mm_struct *mm, long pages) +static inline int security_vm_enough_memory_kern(long pages) { - return cap_vm_enough_memory(mm, pages); + /* If current->mm is a kernel thread then we will pass NULL, + for this specific case that is fine */ + return cap_vm_enough_memory(current->mm, pages); } static inline int security_bprm_alloc(struct linux_binprm *bprm) -- cgit v1.2.3 From 53a08807c01989c6847bb135d8d43f61c5dfdda5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 3 Dec 2008 12:41:26 +0100 Subject: block: internal dequeue shouldn't start timer blkdev_dequeue_request() and elv_dequeue_request() are equivalent and both start the timeout timer. Barrier code dequeues the original barrier request but doesn't passes the request itself to lower level driver, only broken down proxy requests; however, as the original barrier code goes through the same dequeue path and timeout timer is started on it. If barrier sequence takes long enough, this timer expires but the low level driver has no idea about this request and oops follows. Timeout timer shouldn't have been started on the original barrier request as it never goes through actual IO. This patch unexports elv_dequeue_request(), which has no external user anyway, and makes it operate on elevator proper w/o adding the timer and make blkdev_dequeue_request() call elv_dequeue_request() and add timer. Internal users which don't pass the request to driver - barrier code and end_that_request_last() - are converted to use elv_dequeue_request(). Signed-off-by: Tejun Heo Cc: Mike Anderson Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a135256b272c..9cc7cc5fdce1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -786,6 +786,8 @@ static inline void blk_run_address_space(struct address_space *mapping) blk_run_backing_dev(mapping->backing_dev_info, NULL); } +extern void blkdev_dequeue_request(struct request *req); + /* * blk_end_request() and friends. * __blk_end_request() and end_request() must be called with @@ -820,11 +822,6 @@ extern void blk_update_request(struct request *rq, int error, extern unsigned int blk_rq_bytes(struct request *rq); extern unsigned int blk_rq_cur_bytes(struct request *rq); -static inline void blkdev_dequeue_request(struct request *req) -{ - elv_dequeue_request(req->q, req); -} - /* * Access functions for manipulating queue properties */ -- cgit v1.2.3 From 0e435ac26e3f951d83338ed3d4ab7dc0fe0055bc Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Wed, 3 Dec 2008 12:55:08 +0100 Subject: block: fix setting of max_segment_size and seg_boundary mask Fix setting of max_segment_size and seg_boundary mask for stacked md/dm devices. When stacking devices (LVM over MD over SCSI) some of the request queue parameters are not set up correctly in some cases by default, namely max_segment_size and and seg_boundary mask. If you create MD device over SCSI, these attributes are zeroed. Problem become when there is over this mapping next device-mapper mapping - queue attributes are set in DM this way: request_queue max_segment_size seg_boundary_mask SCSI 65536 0xffffffff MD RAID1 0 0 LVM 65536 -1 (64bit) Unfortunately bio_add_page (resp. bio_phys_segments) calculates number of physical segments according to these parameters. During the generic_make_request() is segment cout recalculated and can increase bio->bi_phys_segments count over the allowed limit. (After bio_clone() in stack operation.) Thi is specially problem in CCISS driver, where it produce OOPS here BUG_ON(creq->nr_phys_segments > MAXSGENTRIES); (MAXSEGENTRIES is 31 by default.) Sometimes even this command is enough to cause oops: dd iflag=direct if=/dev// of=/dev/null bs=128000 count=10 This command generates bios with 250 sectors, allocated in 32 4k-pages (last page uses only 1024 bytes). For LVM layer, it allocates bio with 31 segments (still OK for CCISS), unfortunatelly on lower layer it is recalculated to 32 segments and this violates CCISS restriction and triggers BUG_ON(). The patch tries to fix it by: * initializing attributes above in queue request constructor blk_queue_make_request() * make sure that blk_queue_stack_limits() inherits setting (DM uses its own function to set the limits because it blk_queue_stack_limits() was introduced later. It should probably switch to use generic stack limit function too.) * sets the default seg_boundary value in one place (blkdev.h) * use this mask as default in DM (instead of -1, which differs in 64bit) Bugs related to this: https://bugzilla.redhat.com/show_bug.cgi?id=471639 http://bugzilla.kernel.org/show_bug.cgi?id=8672 Signed-off-by: Milan Broz Reviewed-by: Alasdair G Kergon Cc: Neil Brown Cc: FUJITA Tomonori Cc: Tejun Heo Cc: Mike Miller Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9cc7cc5fdce1..6dcd30d806cd 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -918,6 +918,8 @@ extern void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter); #define MAX_SEGMENT_SIZE 65536 +#define BLK_SEG_BOUNDARY_MASK 0xFFFFFFFFUL + #define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist) static inline int queue_hardsect_size(struct request_queue *q) -- cgit v1.2.3 From fd4ce1acd0f8558033b1a6968001552bd7671e6d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 5 Nov 2008 14:58:42 +0100 Subject: [PATCH 1/2] kill FMODE_NDELAY_NOW Update FMODE_NDELAY before each ioctl call so that we can kill the magic FMODE_NDELAY_NOW. It would be even better to do this directly in setfl(), but for that we'd need to have FMODE_NDELAY for all files, not just block special files. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- include/linux/fs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 0dcdd9458f4b..b3345a90e11a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -79,7 +79,6 @@ extern int dir_notify_enable; #define FMODE_NDELAY ((__force fmode_t)32) #define FMODE_EXCL ((__force fmode_t)64) #define FMODE_WRITE_IOCTL ((__force fmode_t)128) -#define FMODE_NDELAY_NOW ((__force fmode_t)256) #define RW_MASK 1 #define RWA_MASK 2 -- cgit v1.2.3 From fc9161e54d0dbf799beff9692ea1cc6237162b85 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 5 Nov 2008 14:58:46 +0100 Subject: [PATCH 2/2] documnt FMODE_ constants Make sure all FMODE_ constants are documents, and ensure a coherent style for the already existing comments. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- include/linux/fs.h | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index b3345a90e11a..4a853ef6fd35 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -63,21 +63,23 @@ extern int dir_notify_enable; #define MAY_ACCESS 16 #define MAY_OPEN 32 -#define FMODE_READ ((__force fmode_t)1) -#define FMODE_WRITE ((__force fmode_t)2) - -/* Internal kernel extensions */ -#define FMODE_LSEEK ((__force fmode_t)4) -#define FMODE_PREAD ((__force fmode_t)8) -#define FMODE_PWRITE FMODE_PREAD /* These go hand in hand */ - -/* File is being opened for execution. Primary users of this flag are - distributed filesystems that can use it to achieve correct ETXTBUSY - behavior for cross-node execution/opening_for_writing of files */ -#define FMODE_EXEC ((__force fmode_t)16) - -#define FMODE_NDELAY ((__force fmode_t)32) -#define FMODE_EXCL ((__force fmode_t)64) +/* file is open for reading */ +#define FMODE_READ ((__force fmode_t)1) +/* file is open for writing */ +#define FMODE_WRITE ((__force fmode_t)2) +/* file is seekable */ +#define FMODE_LSEEK ((__force fmode_t)4) +/* file can be accessed using pread/pwrite */ +#define FMODE_PREAD ((__force fmode_t)8) +#define FMODE_PWRITE FMODE_PREAD /* These go hand in hand */ +/* File is opened for execution with sys_execve / sys_uselib */ +#define FMODE_EXEC ((__force fmode_t)16) +/* File is opened with O_NDELAY (only set for block devices) */ +#define FMODE_NDELAY ((__force fmode_t)32) +/* File is opened with O_EXCL (only set for block devices) */ +#define FMODE_EXCL ((__force fmode_t)64) +/* File is opened using open(.., 3, ..) and is writeable only for ioctls + (specialy hack for floppy.c) */ #define FMODE_WRITE_IOCTL ((__force fmode_t)128) #define RW_MASK 1 -- cgit v1.2.3 From f2f1fa78a155524b849edf359e42a3001ea652c0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 5 Dec 2008 14:49:18 -0800 Subject: Enforce a minimum SG_IO timeout There's no point in having too short SG_IO timeouts, since if the command does end up timing out, we'll end up through the reset sequence that is several seconds long in order to abort the command that timed out. As a result, shorter timeouts than a few seconds simply do not make sense, as the recovery would be longer than the timeout itself. Add a BLK_MIN_SG_TIMEOUT to match the existign BLK_DEFAULT_SG_TIMEOUT. Suggested-by: Alan Cox Acked-by: Tejun Heo Acked-by: Jens Axboe Cc: Jeff Garzik Signed-off-by: Linus Torvalds --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6dcd30d806cd..031a315c0509 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -662,6 +662,7 @@ extern unsigned long blk_max_low_pfn, blk_max_pfn; * default timeout for SG_IO if none specified */ #define BLK_DEFAULT_SG_TIMEOUT (60 * HZ) +#define BLK_MIN_SG_TIMEOUT (7 * HZ) #ifdef CONFIG_BOUNCE extern int init_emergency_isa_pool(void); -- cgit v1.2.3