summaryrefslogtreecommitdiff
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/bcache/alloc.c19
-rw-r--r--drivers/md/bcache/super.c17
-rw-r--r--drivers/md/dm-bufio.c5
-rw-r--r--drivers/md/dm-io.c1
-rw-r--r--drivers/md/dm-ioctl.c4
-rw-r--r--drivers/md/dm-verity-target.c65
-rw-r--r--drivers/md/dm-verity.h1
-rw-r--r--drivers/md/dm.c3
-rw-r--r--drivers/md/md-cluster.c4
-rw-r--r--drivers/md/md.c10
-rw-r--r--drivers/md/raid10.c7
-rw-r--r--drivers/md/raid5.c30
12 files changed, 135 insertions, 31 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 4d46f2ce606f..aa84fcfd59fc 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -514,15 +514,21 @@ struct open_bucket {
/*
* We keep multiple buckets open for writes, and try to segregate different
- * write streams for better cache utilization: first we look for a bucket where
- * the last write to it was sequential with the current write, and failing that
- * we look for a bucket that was last used by the same task.
+ * write streams for better cache utilization: first we try to segregate flash
+ * only volume write streams from cached devices, secondly we look for a bucket
+ * where the last write to it was sequential with the current write, and
+ * failing that we look for a bucket that was last used by the same task.
*
* The ideas is if you've got multiple tasks pulling data into the cache at the
* same time, you'll get better cache utilization if you try to segregate their
* data and preserve locality.
*
- * For example, say you've starting Firefox at the same time you're copying a
+ * For example, dirty sectors of flash only volume is not reclaimable, if their
+ * dirty sectors mixed with dirty sectors of cached device, such buckets will
+ * be marked as dirty and won't be reclaimed, though the dirty data of cached
+ * device have been written back to backend device.
+ *
+ * And say you've starting Firefox at the same time you're copying a
* bunch of files. Firefox will likely end up being fairly hot and stay in the
* cache awhile, but the data you copied might not be; if you wrote all that
* data to the same buckets it'd get invalidated at the same time.
@@ -539,7 +545,10 @@ static struct open_bucket *pick_data_bucket(struct cache_set *c,
struct open_bucket *ret, *ret_task = NULL;
list_for_each_entry_reverse(ret, &c->data_buckets, list)
- if (!bkey_cmp(&ret->key, search))
+ if (UUID_FLASH_ONLY(&c->uuids[KEY_INODE(&ret->key)]) !=
+ UUID_FLASH_ONLY(&c->uuids[KEY_INODE(search)]))
+ continue;
+ else if (!bkey_cmp(&ret->key, search))
goto found;
else if (ret->last_write_point == write_point)
ret_task = ret;
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index c2248b75f2da..b9a526271f02 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -890,6 +890,12 @@ static void cached_dev_detach_finish(struct work_struct *w)
mutex_lock(&bch_register_lock);
+ cancel_delayed_work_sync(&dc->writeback_rate_update);
+ if (!IS_ERR_OR_NULL(dc->writeback_thread)) {
+ kthread_stop(dc->writeback_thread);
+ dc->writeback_thread = NULL;
+ }
+
memset(&dc->sb.set_uuid, 0, 16);
SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE);
@@ -935,6 +941,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
uint32_t rtime = cpu_to_le32(get_seconds());
struct uuid_entry *u;
char buf[BDEVNAME_SIZE];
+ struct cached_dev *exist_dc, *t;
bdevname(dc->bdev, buf);
@@ -958,6 +965,16 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
return -EINVAL;
}
+ /* Check whether already attached */
+ list_for_each_entry_safe(exist_dc, t, &c->cached_devs, list) {
+ if (!memcmp(dc->sb.uuid, exist_dc->sb.uuid, 16)) {
+ pr_err("Tried to attach %s but duplicate UUID already attached",
+ buf);
+
+ return -EINVAL;
+ }
+ }
+
u = uuid_find(c, dc->sb.uuid);
if (u &&
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 969c815c90b6..d566c32e222a 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -818,7 +818,8 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client
* dm-bufio is resistant to allocation failures (it just keeps
* one buffer reserved in cases all the allocations fail).
* So set flags to not try too hard:
- * GFP_NOIO: don't recurse into the I/O layer
+ * GFP_NOWAIT: don't wait; if we need to sleep we'll release our
+ * mutex and wait ourselves.
* __GFP_NORETRY: don't retry and rather return failure
* __GFP_NOMEMALLOC: don't use emergency reserves
* __GFP_NOWARN: don't print a warning in case of failure
@@ -828,7 +829,7 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client
*/
while (1) {
if (dm_bufio_cache_size_latch != 1) {
- b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+ b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
if (b)
return b;
}
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 81c5e1a1f363..1b84d2890fbf 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -300,6 +300,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
else if (rw & REQ_WRITE_SAME)
special_cmd_max_sectors = q->limits.max_write_same_sectors;
if ((rw & (REQ_DISCARD | REQ_WRITE_SAME)) && special_cmd_max_sectors == 0) {
+ atomic_inc(&io->count);
dec_count(io, region, -EOPNOTSUPP);
return;
}
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 7baeeafa059d..065d7cee0d21 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1773,12 +1773,12 @@ static int validate_params(uint cmd, struct dm_ioctl *param)
cmd == DM_LIST_VERSIONS_CMD)
return 0;
- if ((cmd == DM_DEV_CREATE_CMD)) {
+ if (cmd == DM_DEV_CREATE_CMD) {
if (!*param->name) {
DMWARN("name not supplied when creating device");
return -EINVAL;
}
- } else if ((*param->uuid && *param->name)) {
+ } else if (*param->uuid && *param->name) {
DMWARN("only supply one of name or uuid, cmd(%u)", cmd);
return -EINVAL;
}
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index e34cf53bd068..ceff074b3b74 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -19,6 +19,7 @@
#include <linux/module.h>
#include <linux/reboot.h>
+#include <linux/vmalloc.h>
#define DM_MSG_PREFIX "verity"
@@ -32,6 +33,7 @@
#define DM_VERITY_OPT_LOGGING "ignore_corruption"
#define DM_VERITY_OPT_RESTART "restart_on_corruption"
#define DM_VERITY_OPT_IGN_ZEROES "ignore_zero_blocks"
+#define DM_VERITY_OPT_AT_MOST_ONCE "check_at_most_once"
#define DM_VERITY_OPTS_MAX (2 + DM_VERITY_OPTS_FEC)
@@ -399,6 +401,18 @@ static int verity_bv_zero(struct dm_verity *v, struct dm_verity_io *io,
}
/*
+ * Moves the bio iter one data block forward.
+ */
+static inline void verity_bv_skip_block(struct dm_verity *v,
+ struct dm_verity_io *io,
+ struct bvec_iter *iter)
+{
+ struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_bio_data_size);
+
+ bio_advance_iter(bio, iter, 1 << v->data_dev_block_bits);
+}
+
+/*
* Verify one "dm_verity_io" structure.
*/
static int verity_verify_io(struct dm_verity_io *io)
@@ -410,9 +424,16 @@ static int verity_verify_io(struct dm_verity_io *io)
for (b = 0; b < io->n_blocks; b++) {
int r;
+ sector_t cur_block = io->block + b;
struct shash_desc *desc = verity_io_hash_desc(v, io);
- r = verity_hash_for_block(v, io, io->block + b,
+ if (v->validated_blocks &&
+ likely(test_bit(cur_block, v->validated_blocks))) {
+ verity_bv_skip_block(v, io, &io->iter);
+ continue;
+ }
+
+ r = verity_hash_for_block(v, io, cur_block,
verity_io_want_digest(v, io),
&is_zero);
if (unlikely(r < 0))
@@ -445,13 +466,16 @@ static int verity_verify_io(struct dm_verity_io *io)
return r;
if (likely(memcmp(verity_io_real_digest(v, io),
- verity_io_want_digest(v, io), v->digest_size) == 0))
+ verity_io_want_digest(v, io), v->digest_size) == 0)) {
+ if (v->validated_blocks)
+ set_bit(cur_block, v->validated_blocks);
continue;
+ }
else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA,
- io->block + b, NULL, &start) == 0)
+ cur_block, NULL, &start) == 0)
continue;
else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA,
- io->block + b))
+ cur_block))
return -EIO;
}
@@ -645,6 +669,8 @@ void verity_status(struct dm_target *ti, status_type_t type,
args += DM_VERITY_OPTS_FEC;
if (v->zero_digest)
args++;
+ if (v->validated_blocks)
+ args++;
if (!args)
return;
DMEMIT(" %u", args);
@@ -663,6 +689,8 @@ void verity_status(struct dm_target *ti, status_type_t type,
}
if (v->zero_digest)
DMEMIT(" " DM_VERITY_OPT_IGN_ZEROES);
+ if (v->validated_blocks)
+ DMEMIT(" " DM_VERITY_OPT_AT_MOST_ONCE);
sz = verity_fec_status_table(v, sz, result, maxlen);
break;
}
@@ -716,6 +744,7 @@ void verity_dtr(struct dm_target *ti)
if (v->bufio)
dm_bufio_client_destroy(v->bufio);
+ vfree(v->validated_blocks);
kfree(v->salt);
kfree(v->root_digest);
kfree(v->zero_digest);
@@ -737,6 +766,26 @@ void verity_dtr(struct dm_target *ti)
}
EXPORT_SYMBOL_GPL(verity_dtr);
+static int verity_alloc_most_once(struct dm_verity *v)
+{
+ struct dm_target *ti = v->ti;
+
+ /* the bitset can only handle INT_MAX blocks */
+ if (v->data_blocks > INT_MAX) {
+ ti->error = "device too large to use check_at_most_once";
+ return -E2BIG;
+ }
+
+ v->validated_blocks = vzalloc(BITS_TO_LONGS(v->data_blocks) *
+ sizeof(unsigned long));
+ if (!v->validated_blocks) {
+ ti->error = "failed to allocate bitset for check_at_most_once";
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
static int verity_alloc_zero_digest(struct dm_verity *v)
{
int r = -ENOMEM;
@@ -806,6 +855,12 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v)
}
continue;
+ } else if (!strcasecmp(arg_name, DM_VERITY_OPT_AT_MOST_ONCE)) {
+ r = verity_alloc_most_once(v);
+ if (r)
+ return r;
+ continue;
+
} else if (verity_is_fec_opt_arg(arg_name)) {
r = verity_fec_parse_opt_args(as, v, &argc, arg_name);
if (r)
@@ -1074,7 +1129,7 @@ EXPORT_SYMBOL_GPL(verity_ctr);
static struct target_type verity_target = {
.name = "verity",
- .version = {1, 3, 0},
+ .version = {1, 4, 0},
.module = THIS_MODULE,
.ctr = verity_ctr,
.dtr = verity_dtr,
diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h
index a90d1d416107..d216fc76d350 100644
--- a/drivers/md/dm-verity.h
+++ b/drivers/md/dm-verity.h
@@ -63,6 +63,7 @@ struct dm_verity {
sector_t hash_level_block[DM_VERITY_MAX_LEVELS];
struct dm_verity_fec *fec; /* forward error correction */
+ unsigned long *validated_blocks; /* bitset blocks validated */
};
struct dm_verity_io {
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index f7f560f5f056..f002d2ce9c9f 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -974,7 +974,8 @@ static void dec_pending(struct dm_io *io, int error)
} else {
/* done with normal IO or empty flush */
trace_block_bio_complete(md->queue, bio, io_error);
- bio->bi_error = io_error;
+ if (io_error)
+ bio->bi_error = io_error;
bio_endio(bio);
}
}
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 494d01d0e92a..a7a561af05c9 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -945,8 +945,10 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev)
cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
lock_comm(cinfo);
ret = __sendmsg(cinfo, &cmsg);
- if (ret)
+ if (ret) {
+ unlock_comm(cinfo);
return ret;
+ }
cinfo->no_new_dev_lockres->flags |= DLM_LKF_NOQUEUE;
ret = dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_EX);
cinfo->no_new_dev_lockres->flags &= ~DLM_LKF_NOQUEUE;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1cd819202553..3a9685fe115c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1028,8 +1028,9 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
* (not needed for Linear and RAID0 as metadata doesn't
* record this size)
*/
- if (rdev->sectors >= (2ULL << 32) && sb->level >= 1)
- rdev->sectors = (2ULL << 32) - 2;
+ if (IS_ENABLED(CONFIG_LBDAF) && (u64)rdev->sectors >= (2ULL << 32) &&
+ sb->level >= 1)
+ rdev->sectors = (sector_t)(2ULL << 32) - 2;
if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
/* "this cannot possibly happen" ... */
@@ -1322,8 +1323,9 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
/* Limit to 4TB as metadata cannot record more than that.
* 4TB == 2^32 KB, or 2*2^32 sectors.
*/
- if (num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
- num_sectors = (2ULL << 32) - 2;
+ if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) &&
+ rdev->mddev->level >= 1)
+ num_sectors = (sector_t)(2ULL << 32) - 2;
md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
rdev->sb_page);
md_super_wait(rdev->mddev);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index a67e1a36733f..45e7a47e5f7b 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2698,6 +2698,11 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
list_add(&r10_bio->retry_list, &conf->bio_end_io_list);
conf->nr_queued++;
spin_unlock_irq(&conf->device_lock);
+ /*
+ * In case freeze_array() is waiting for condition
+ * nr_pending == nr_queued + extra to be true.
+ */
+ wake_up(&conf->wait_barrier);
md_wakeup_thread(conf->mddev->thread);
} else {
if (test_bit(R10BIO_WriteError,
@@ -3633,6 +3638,7 @@ static int run(struct mddev *mddev)
if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
discard_supported = true;
+ first = 0;
}
if (mddev->queue) {
@@ -4039,6 +4045,7 @@ static int raid10_start_reshape(struct mddev *mddev)
diff = 0;
if (first || diff < min_offset_diff)
min_offset_diff = diff;
+ first = 0;
}
}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 77403228e098..9284acea4f7b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -110,8 +110,7 @@ static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
{
int i;
- local_irq_disable();
- spin_lock(conf->hash_locks);
+ spin_lock_irq(conf->hash_locks);
for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
spin_lock(&conf->device_lock);
@@ -121,9 +120,9 @@ static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
{
int i;
spin_unlock(&conf->device_lock);
- for (i = NR_STRIPE_HASH_LOCKS; i; i--)
- spin_unlock(conf->hash_locks + i - 1);
- local_irq_enable();
+ for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--)
+ spin_unlock(conf->hash_locks + i);
+ spin_unlock_irq(conf->hash_locks);
}
/* bio's attached to a stripe+device for I/O are linked together in bi_sector
@@ -726,12 +725,11 @@ static bool is_full_stripe_write(struct stripe_head *sh)
static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
{
- local_irq_disable();
if (sh1 > sh2) {
- spin_lock(&sh2->stripe_lock);
+ spin_lock_irq(&sh2->stripe_lock);
spin_lock_nested(&sh1->stripe_lock, 1);
} else {
- spin_lock(&sh1->stripe_lock);
+ spin_lock_irq(&sh1->stripe_lock);
spin_lock_nested(&sh2->stripe_lock, 1);
}
}
@@ -739,8 +737,7 @@ static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
{
spin_unlock(&sh1->stripe_lock);
- spin_unlock(&sh2->stripe_lock);
- local_irq_enable();
+ spin_unlock_irq(&sh2->stripe_lock);
}
/* Only freshly new full stripe normal write stripe can be added to a batch list */
@@ -3372,9 +3369,20 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
BUG_ON(test_bit(R5_Wantread, &dev->flags));
BUG_ON(sh->batch_head);
+
+ /*
+ * In the raid6 case if the only non-uptodate disk is P
+ * then we already trusted P to compute the other failed
+ * drives. It is safe to compute rather than re-read P.
+ * In other cases we only compute blocks from failed
+ * devices, otherwise check/repair might fail to detect
+ * a real inconsistency.
+ */
+
if ((s->uptodate == disks - 1) &&
+ ((sh->qd_idx >= 0 && sh->pd_idx == disk_idx) ||
(s->failed && (disk_idx == s->failed_num[0] ||
- disk_idx == s->failed_num[1]))) {
+ disk_idx == s->failed_num[1])))) {
/* have disk failed, and we're requested to fetch it;
* do compute it
*/