From 796d5116c407690b14fd5bda136aa67a39e7061a Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Thu, 2 Jun 2011 21:19:05 +0200 Subject: iosched: prevent aliased requests from starving other I/O Hi, Jens, If you recall, I posted an RFC patch for this back in July of last year: http://lkml.org/lkml/2010/7/13/279 The basic problem is that a process can issue a never-ending stream of async direct I/Os to the same sector on a device, thus starving out other I/O in the system (due to the way the alias handling works in both cfq and deadline). The solution I proposed back then was to start dispatching from the fifo after a certain number of aliases had been dispatched. Vivek asked why we had to treat aliases differently at all, and I never had a good answer. So, I put together a simple patch which allows aliases to be added to the rb tree (it adds them to the right, though that doesn't matter as the order isn't guaranteed anyway). I think this is the preferred solution, as it doesn't break up time slices in CFQ or batches in deadline. I've tested it, and it does solve the starvation issue. Let me know what you think. Cheers, Jeff Signed-off-by: Jeff Moyer Signed-off-by: Jens Axboe --- include/linux/elevator.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 21a8ebf2dc3a..d800d5142184 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -146,7 +146,7 @@ extern struct request *elv_rb_latter_request(struct request_queue *, struct requ /* * rb support functions. */ -extern struct request *elv_rb_add(struct rb_root *, struct request *); +extern void elv_rb_add(struct rb_root *, struct request *); extern void elv_rb_del(struct rb_root *, struct request *); extern struct request *elv_rb_find(struct rb_root *, sector_t); -- cgit v1.2.3 From 4d0d98b60eba726e0a4f3e6617628b070c444707 Mon Sep 17 00:00:00 2001 From: Wanlong Gao Date: Mon, 13 Jun 2011 10:45:38 +0200 Subject: block:fix the comment error in blkdev.h There is not a function rq_init but blk_rq_init in block/blk-core.c. Signed-off-by: Wanlong Gao Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ae9091a68480..4ce6e68da2bd 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -73,7 +73,7 @@ enum rq_cmd_type_bits { /* * try to put the fields that are referenced together in the same cacheline. - * if you modify this structure, be sure to check block/blk-core.c:rq_init() + * if you modify this structure, be sure to check block/blk-core.c:blk_rq_init() * as well! */ struct request { -- cgit v1.2.3 From fd16d263194aa6b50b215eb593a567b59d744d6e Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 13 Jun 2011 10:42:49 +0200 Subject: block: Add __attribute__((format(printf...) and fix fallout Use the compiler to verify format strings and arguments. Fix fallout. Signed-off-by: Joe Perches Signed-off-by: Jens Axboe --- include/linux/blktrace_api.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index b22fb0d3db0f..8c7c2de7631a 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -169,7 +169,8 @@ extern void blk_trace_shutdown(struct request_queue *); extern int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct block_device *bdev, struct blk_user_trace_setup *buts); -extern void __trace_note_message(struct blk_trace *, const char *fmt, ...); +extern __attribute__((format(printf, 2, 3))) +void __trace_note_message(struct blk_trace *, const char *fmt, ...); /** * blk_add_trace_msg - Add a (simple) message to the blktrace stream -- cgit v1.2.3 From 85ef06d1d252f6a2e73b678591ab71caad4667bb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 1 Jul 2011 16:17:47 +0200 Subject: block: flush MEDIA_CHANGE from drivers on close(2) Currently, only open(2) is defined as the 'clearing' point. It has two roles - first, it's an acknowledgement from userland indicating that the event has been received and kernel can clear pending states and proceed to generate more events. Secondly, it's passed on to device drivers as a hint indicating that a synchronization point has been reached and it might want to take a deeper look at the device. The latter currently is only used by sr which uses two different mechanisms - GET_EVENT_MEDIA_STATUS_NOTIFICATION and TEST_UNIT_READY to discover events, where the former is lighter weight and safe to be used repeatedly but may not provide full coverage. Among other things, GET_EVENT can't detect media removal while TUR can. This patch makes close(2) - blkdev_put() - indicate clearing hint for MEDIA_CHANGE to drivers. disk_check_events() is renamed to disk_flush_events() and updated to take @mask for events to flush which is or'd to ev->clearing and will be passed to the driver on the next ->check_events() invocation. This change makes sr generate MEDIA_CHANGE when media is ejected from userland - e.g. with eject(1). Note: Given the current usage, it seems @clearing hint is needlessly complex. disk_clear_events() can simply clear all events and the hint can be boolean @flush. Signed-off-by: Tejun Heo Cc: Kay Sievers Signed-off-by: Jens Axboe --- include/linux/genhd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 300d7582006e..02fa4697a0e5 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -420,7 +420,7 @@ static inline int get_disk_ro(struct gendisk *disk) extern void disk_block_events(struct gendisk *disk); extern void disk_unblock_events(struct gendisk *disk); -extern void disk_check_events(struct gendisk *disk); +extern void disk_flush_events(struct gendisk *disk, unsigned int mask); extern unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask); /* drivers/char/random.c */ -- cgit v1.2.3 From 390192b300570b2bc721d77067ca133f58015ae8 Mon Sep 17 00:00:00 2001 From: Johannes Stezenbach Date: Fri, 1 Jul 2011 22:32:26 +0200 Subject: compat_ioctl: fix warning caused by qemu On Linux x86_64 host with 32bit userspace, running qemu or even just "qemu-img create -f qcow2 some.img 1G" causes a kernel warning: ioctl32(qemu-img:5296): Unknown cmd fd(3) cmd(00005326){t:'S';sz:0} arg(7fffffff) on some.img ioctl32(qemu-img:5296): Unknown cmd fd(3) cmd(801c0204){t:02;sz:28} arg(fff77350) on some.img ioctl 00005326 is CDROM_DRIVE_STATUS, ioctl 801c0204 is FDGETPRM. The warning appears because the Linux compat-ioctl handler for these ioctls only applies to block devices, while qemu also uses the ioctls on plain files. Signed-off-by: Johannes Stezenbach Acked-by: Arnd Bergmann Signed-off-by: Jens Axboe --- include/linux/fd.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fd.h b/include/linux/fd.h index f5d194af07a8..c6a68d011608 100644 --- a/include/linux/fd.h +++ b/include/linux/fd.h @@ -3,6 +3,7 @@ #include #include +#include /* New file layout: Now the ioctl definitions immediately follow the * definitions of the structures that they use */ @@ -377,4 +378,21 @@ struct floppy_raw_cmd { #define FDEJECT _IO(2, 0x5a) /* eject the disk */ +#ifdef CONFIG_COMPAT +struct compat_floppy_struct { + compat_uint_t size; + compat_uint_t sect; + compat_uint_t head; + compat_uint_t track; + compat_uint_t stretch; + unsigned char gap; + unsigned char rate; + unsigned char spec1; + unsigned char fmt_gap; + const compat_caddr_t name; +}; + +#define FDGETPRM32 _IOR(2, 0x04, struct compat_floppy_struct) +#endif + #endif -- cgit v1.2.3 From 719c0c590609809365c6f3da2f923cd84dc99113 Mon Sep 17 00:00:00 2001 From: Johannes Stezenbach Date: Thu, 7 Jul 2011 08:18:18 +0200 Subject: compat_ioctl: fix make headers_check regression Fix headers_check error introduced by 390192b30057: include/linux/fd.h:6: included file 'linux/compat.h' is not exported Signed-off-by: Johannes Stezenbach Signed-off-by: Jens Axboe --- include/linux/fd.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fd.h b/include/linux/fd.h index c6a68d011608..72202b1b9a6a 100644 --- a/include/linux/fd.h +++ b/include/linux/fd.h @@ -3,7 +3,6 @@ #include #include -#include /* New file layout: Now the ioctl definitions immediately follow the * definitions of the structures that they use */ @@ -378,7 +377,11 @@ struct floppy_raw_cmd { #define FDEJECT _IO(2, 0x5a) /* eject the disk */ + +#ifdef __KERNEL__ #ifdef CONFIG_COMPAT +#include + struct compat_floppy_struct { compat_uint_t size; compat_uint_t sect; @@ -394,5 +397,6 @@ struct compat_floppy_struct { #define FDGETPRM32 _IOR(2, 0x04, struct compat_floppy_struct) #endif +#endif #endif -- cgit v1.2.3 From 55c022bbddb2c056b5dff1bd1b1758d31b6d64c9 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 8 Jul 2011 08:19:20 +0200 Subject: block: avoid building too big plug list When I test fio script with big I/O depth, I found the total throughput drops compared to some relative small I/O depth. The reason is the thread accumulates big requests in its plug list and causes some delays (surely this depends on CPU speed). I thought we'd better have a threshold for requests. When a threshold reaches, this means there is no request merge and queue lock contention isn't severe when pushing per-task requests to queue, so the main advantages of blk plug don't exist. We can force a plug list flush in this case. With this, my test throughput actually increases and almost equals to small I/O depth. Another side effect is irq off time decreases in blk_flush_plug_list() for big I/O depth. The BLK_MAX_REQUEST_COUNT is choosen arbitarily, but 16 is efficiently to reduce lock contention to me. But I'm open here, 32 is ok in my test too. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 36f2e2b99ae3..92edb9601242 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -862,7 +862,10 @@ struct blk_plug { struct list_head list; struct list_head cb_list; unsigned int should_sort; + unsigned int count; }; +#define BLK_MAX_REQUEST_COUNT 16 + struct blk_plug_cb { struct list_head list; void (*callback)(struct blk_plug_cb *); -- cgit v1.2.3 From 316cc67d5e03801a5ee4ac660a4dfe9e02aed475 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 8 Jul 2011 08:19:21 +0200 Subject: block: document blk_plug list access I'm often confused why not disable preempt when changing blk_plug list. It would be better to add comments here in case others have the similar concerns. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 92edb9601242..6dcea6885a5d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -857,6 +857,12 @@ struct request_queue *blk_alloc_queue(gfp_t); struct request_queue *blk_alloc_queue_node(gfp_t, int); extern void blk_put_queue(struct request_queue *); +/* + * Note: Code in between changing the blk_plug list/cb_list or element of such + * lists is preemptable, but such code can't do sleep (or be very careful), + * otherwise data is corrupted. For details, please check schedule() where + * blk_schedule_flush_plug() is called. + */ struct blk_plug { unsigned long magic; struct list_head list; -- cgit v1.2.3 From 4aede84b33d6beb401136a3deca0651ae07c5e99 Mon Sep 17 00:00:00 2001 From: Justin TerAvest Date: Tue, 12 Jul 2011 08:31:45 +0200 Subject: fixlet: Remove fs_excl from struct task. fs_excl is a poor man's priority inheritance for filesystems to hint to the block layer that an operation is important. It was never clearly specified, not widely adopted, and will not prevent starvation in many cases (like across cgroups). fs_excl was introduced with the time sliced CFQ IO scheduler, to indicate when a process held FS exclusive resources and thus needed a boost. It doesn't cover all file systems, and it was never fully complete. Lets kill it. Signed-off-by: Justin TerAvest Signed-off-by: Jens Axboe --- include/linux/fs.h | 4 ---- include/linux/init_task.h | 1 - include/linux/sched.h | 1 - 3 files changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 6e73e2e9ae33..f6c866c287b5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1453,10 +1453,6 @@ enum { #define vfs_check_frozen(sb, level) \ wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level))) -#define get_fs_excl() atomic_inc(¤t->fs_excl) -#define put_fs_excl() atomic_dec(¤t->fs_excl) -#define has_fs_excl() atomic_read(¤t->fs_excl) - /* * until VFS tracks user namespaces for inodes, just make all files * belong to init_user_ns diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 580f70c02391..d14e058aaeed 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -176,7 +176,6 @@ extern struct cred init_cred; .alloc_lock = __SPIN_LOCK_UNLOCKED(tsk.alloc_lock), \ .journal_info = NULL, \ .cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \ - .fs_excl = ATOMIC_INIT(0), \ .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock), \ .timer_slack_ns = 50000, /* 50 usec default slack */ \ .pids = { \ diff --git a/include/linux/sched.h b/include/linux/sched.h index a837b20ba190..22f54249cde1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1503,7 +1503,6 @@ struct task_struct { short il_next; short pref_node_fork; #endif - atomic_t fs_excl; /* holding fs exclusive resources */ struct rcu_head rcu; /* -- cgit v1.2.3 From 383cd7213f95a2784ab5038fe292844178768b82 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 12 Jul 2011 14:24:35 +0200 Subject: CFQ: move think time check variables to a separate struct Move the variables to do think time check to a sepatate struct. This is to prepare adding think time check for service tree and group. No functional change. Signed-off-by: Shaohua Li Acked-by: Vivek Goyal Signed-off-by: Jens Axboe --- include/linux/iocontext.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index b2eee896dcbc..5037a0ad2312 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -5,6 +5,14 @@ #include struct cfq_queue; +struct cfq_ttime { + unsigned long last_end_request; + + unsigned long ttime_total; + unsigned long ttime_samples; + unsigned long ttime_mean; +}; + struct cfq_io_context { void *key; @@ -12,11 +20,7 @@ struct cfq_io_context { struct io_context *ioc; - unsigned long last_end_request; - - unsigned long ttime_total; - unsigned long ttime_samples; - unsigned long ttime_mean; + struct cfq_ttime ttime; struct list_head queue_list; struct hlist_node cic_list; -- cgit v1.2.3 From d7b7630130e52361af66ce3b994696e2357ba7de Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Wed, 13 Jul 2011 21:17:23 +0200 Subject: block: reorder request_queue to remove 64 bit alignment padding Reorder request_queue to remove 16 bytes of alignment padding in 64 bit builds. On my config this shrinks the size of this structure from 1608 to 1592 bytes and therefore needs one fewer cachelines. Also trivially move the open bracket { to be on the same line as the structure name to make it easier to grep. Signed-off-by: Richard Kennedy Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6dcea6885a5d..c0cd9a2f22ef 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -260,8 +260,7 @@ struct queue_limits { unsigned char discard_zeroes_data; }; -struct request_queue -{ +struct request_queue { /* * Together with queue_head for cacheline sharing */ @@ -304,14 +303,14 @@ struct request_queue void *queuedata; /* - * queue needs bounce pages for pages above this limit + * various queue flags, see QUEUE_* below */ - gfp_t bounce_gfp; + unsigned long queue_flags; /* - * various queue flags, see QUEUE_* below + * queue needs bounce pages for pages above this limit */ - unsigned long queue_flags; + gfp_t bounce_gfp; /* * protects queue structures from reentrancy. ->__queue_lock should @@ -334,8 +333,8 @@ struct request_queue unsigned int nr_congestion_off; unsigned int nr_batching; - void *dma_drain_buffer; unsigned int dma_drain_size; + void *dma_drain_buffer; unsigned int dma_pad_mask; unsigned int dma_alignment; -- cgit v1.2.3 From 5757a6d76cdf6dda2a492c09b985c015e86779b1 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 23 Jul 2011 20:44:25 +0200 Subject: block: strict rq_affinity Some systems benefit from completions always being steered to the strict requester cpu rather than the looser "per-socket" steering that blk_cpu_to_group() attempts by default. This is because the first CPU in the group mask ends up being completely overloaded with work, while the others (including the original submitter) has power left to spare. Allow the strict mode to be set by writing '2' to the sysfs control file. This is identical to the scheme used for the nomerges file, where '2' is a more aggressive setting than just being turned on. echo 2 > /sys/block//queue/rq_affinity Cc: Christoph Hellwig Cc: Roland Dreier Tested-by: Dave Jiang Signed-off-by: Dan Williams Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c0cd9a2f22ef..0e67c45b3bc9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -392,7 +392,7 @@ struct request_queue { #define QUEUE_FLAG_ELVSWITCH 6 /* don't use elevator, just do FIFO */ #define QUEUE_FLAG_BIDI 7 /* queue supports bidi requests */ #define QUEUE_FLAG_NOMERGES 8 /* disable merge attempts */ -#define QUEUE_FLAG_SAME_COMP 9 /* force complete on same CPU */ +#define QUEUE_FLAG_SAME_COMP 9 /* complete on same CPU-group */ #define QUEUE_FLAG_FAIL_IO 10 /* fake timeout */ #define QUEUE_FLAG_STACKABLE 11 /* supports request stacking */ #define QUEUE_FLAG_NONROT 12 /* non-rotational device (SSD) */ @@ -402,6 +402,7 @@ struct request_queue { #define QUEUE_FLAG_NOXMERGES 15 /* No extended merges */ #define QUEUE_FLAG_ADD_RANDOM 16 /* Contributes to random pool */ #define QUEUE_FLAG_SECDISCARD 17 /* supports SECDISCARD */ +#define QUEUE_FLAG_SAME_FORCE 18 /* force complete on same CPU */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ -- cgit v1.2.3