diff options
Diffstat (limited to 'drivers/md')
49 files changed, 1191 insertions, 589 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 46794cac167e..5310e1f4a282 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -49,7 +49,7 @@ * * bch_bucket_alloc() allocates a single bucket from a specific cache. * - * bch_bucket_alloc_set() allocates one or more buckets from different caches + * bch_bucket_alloc_set() allocates one bucket from different caches * out of a cache set. * * free_some_buckets() drives all the processes described above. It's called @@ -488,34 +488,29 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k) } int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, - struct bkey *k, int n, bool wait) + struct bkey *k, bool wait) { - int i; + struct cache *ca; + long b; /* No allocation if CACHE_SET_IO_DISABLE bit is set */ if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) return -1; lockdep_assert_held(&c->bucket_lock); - BUG_ON(!n || n > c->caches_loaded || n > 8); bkey_init(k); - /* sort by free space/prio of oldest data in caches */ - - for (i = 0; i < n; i++) { - struct cache *ca = c->cache_by_alloc[i]; - long b = bch_bucket_alloc(ca, reserve, wait); + ca = c->cache_by_alloc[0]; + b = bch_bucket_alloc(ca, reserve, wait); + if (b == -1) + goto err; - if (b == -1) - goto err; + k->ptr[0] = MAKE_PTR(ca->buckets[b].gen, + bucket_to_sector(c, b), + ca->sb.nr_this_dev); - k->ptr[i] = MAKE_PTR(ca->buckets[b].gen, - bucket_to_sector(c, b), - ca->sb.nr_this_dev); - - SET_KEY_PTRS(k, i + 1); - } + SET_KEY_PTRS(k, 1); return 0; err: @@ -525,12 +520,12 @@ err: } int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, - struct bkey *k, int n, bool wait) + struct bkey *k, bool wait) { int ret; mutex_lock(&c->bucket_lock); - ret = __bch_bucket_alloc_set(c, reserve, k, n, wait); + ret = __bch_bucket_alloc_set(c, reserve, k, wait); mutex_unlock(&c->bucket_lock); return ret; } @@ -638,7 +633,7 @@ bool bch_alloc_sectors(struct cache_set *c, spin_unlock(&c->data_bucket_lock); - if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, wait)) + if (bch_bucket_alloc_set(c, watermark, &alloc.key, wait)) return false; spin_lock(&c->data_bucket_lock); diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 6a380ed4919a..d0311e3065ca 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -265,6 +265,7 @@ struct bcache_device { #define BCACHE_DEV_WB_RUNNING 3 #define BCACHE_DEV_RATE_DW_RUNNING 4 int nr_stripes; +#define BCH_MIN_STRIPE_SZ ((4 << 20) >> SECTOR_SHIFT) unsigned int stripe_size; atomic_t *stripe_sectors_dirty; unsigned long *full_dirty_stripes; @@ -952,9 +953,9 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k); long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait); int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, - struct bkey *k, int n, bool wait); + struct bkey *k, bool wait); int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve, - struct bkey *k, int n, bool wait); + struct bkey *k, bool wait); bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned int sectors, unsigned int write_point, unsigned int write_prio, bool wait); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index e388e7bb7b5d..de1eb7961fe6 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1008,6 +1008,9 @@ err: * * The btree node will have either a read or a write lock held, depending on * level and op->lock. + * + * Note: Only error code or btree pointer will be returned, it is unncessary + * for callers to check NULL pointer. */ struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op, struct bkey *k, int level, bool write, @@ -1120,16 +1123,22 @@ retry: mutex_unlock(&b->c->bucket_lock); } +/* + * Only error code or btree pointer will be returned, it is unncessary for + * callers to check NULL pointer. + */ struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, int level, bool wait, struct btree *parent) { BKEY_PADDED(key) k; - struct btree *b = ERR_PTR(-EAGAIN); + struct btree *b; mutex_lock(&c->bucket_lock); retry: - if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait)) + /* return ERR_PTR(-EAGAIN) when it fails */ + b = ERR_PTR(-EAGAIN); + if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, wait)) goto err; bkey_put(c, &k.key); @@ -1174,7 +1183,7 @@ static struct btree *btree_node_alloc_replacement(struct btree *b, { struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent); - if (!IS_ERR_OR_NULL(n)) { + if (!IS_ERR(n)) { mutex_lock(&n->write_lock); bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort); bkey_copy_key(&n->key, &b->key); @@ -1389,7 +1398,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, for (i = 0; i < nodes; i++) { new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL); - if (IS_ERR_OR_NULL(new_nodes[i])) + if (IS_ERR(new_nodes[i])) goto out_nocoalesce; } @@ -1541,6 +1550,8 @@ static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op, return 0; n = btree_node_alloc_replacement(replace, NULL); + if (IS_ERR(n)) + return 0; /* recheck reserve after allocating replacement node */ if (btree_check_reserve(b, NULL)) { @@ -1706,7 +1717,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op, if (should_rewrite) { n = btree_node_alloc_replacement(b, NULL); - if (!IS_ERR_OR_NULL(n)) { + if (!IS_ERR(n)) { bch_btree_node_write_sync(n); bch_btree_set_root(n); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index c1e487d1261c..958c3a1c453a 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -1102,6 +1102,12 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio) * which would call closure_get(&dc->disk.cl) */ ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO); + if (!ddip) { + bio->bi_status = BLK_STS_RESOURCE; + bio->bi_end_io(bio); + return; + } + ddip->d = d; ddip->start_time = jiffies; ddip->bi_end_io = bio->bi_end_io; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 7787ec42f81e..70f0f3096bee 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -423,7 +423,7 @@ static int __uuid_write(struct cache_set *c) closure_init_stack(&cl); lockdep_assert_held(&bch_register_lock); - if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true)) + if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, true)) return 1; SET_KEY_SIZE(&k.key, c->sb.bucket_size); @@ -807,6 +807,8 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, if (!d->stripe_size) d->stripe_size = 1 << 31; + else if (d->stripe_size < BCH_MIN_STRIPE_SZ) + d->stripe_size = roundup(BCH_MIN_STRIPE_SZ, d->stripe_size); d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size); @@ -824,20 +826,20 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long); d->full_dirty_stripes = kvzalloc(n, GFP_KERNEL); if (!d->full_dirty_stripes) - return -ENOMEM; + goto out_free_stripe_sectors_dirty; idx = ida_simple_get(&bcache_device_idx, 0, BCACHE_DEVICE_IDX_MAX, GFP_KERNEL); if (idx < 0) - return idx; + goto out_free_full_dirty_stripes; if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio), BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER)) - goto err; + goto out_ida_remove; d->disk = alloc_disk(BCACHE_MINORS); if (!d->disk) - goto err; + goto out_bioset_exit; set_capacity(d->disk, sectors); snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx); @@ -872,8 +874,14 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, return 0; -err: +out_bioset_exit: + bioset_exit(&d->bio_split); +out_ida_remove: ida_simple_remove(&bcache_device_idx, idx); +out_free_full_dirty_stripes: + kvfree(d->full_dirty_stripes); +out_free_stripe_sectors_dirty: + kvfree(d->stripe_sectors_dirty); return -ENOMEM; } @@ -1570,7 +1578,7 @@ static void cache_set_flush(struct closure *cl) if (!IS_ERR_OR_NULL(c->gc_thread)) kthread_stop(c->gc_thread); - if (!IS_ERR_OR_NULL(c->root)) + if (!IS_ERR(c->root)) list_add(&c->root->list, &c->btree_cache); /* Should skip this if we're unregistering because of an error */ @@ -1838,7 +1846,7 @@ static int run_cache_set(struct cache_set *c) c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true, NULL); - if (IS_ERR_OR_NULL(c->root)) + if (IS_ERR(c->root)) goto err; list_del_init(&c->root->list); @@ -1915,7 +1923,7 @@ static int run_cache_set(struct cache_set *c) err = "cannot allocate new btree root"; c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL); - if (IS_ERR_OR_NULL(c->root)) + if (IS_ERR(c->root)) goto err; mutex_lock(&c->root->write_lock); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 591d9c8107dd..64a72222a58c 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -992,7 +992,7 @@ SHOW(__bch_cache) sum += INITIAL_PRIO - cached[i]; if (n) - do_div(sum, n); + sum = div64_u64(sum, n); for (i = 0; i < ARRAY_SIZE(q); i++) q[i] = INITIAL_PRIO - cached[n * (i + 1) / diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index dc385b70e4c3..b3b799f84dcc 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1463,6 +1463,10 @@ EXPORT_SYMBOL_GPL(dm_bufio_get_block_size); sector_t dm_bufio_get_device_size(struct dm_bufio_client *c) { sector_t s = i_size_read(c->bdev->bd_inode) >> SECTOR_SHIFT; + if (s >= c->start) + s -= c->start; + else + s = 0; if (likely(c->sectors_per_block_bits >= 0)) s >>= c->sectors_per_block_bits; else @@ -1471,6 +1475,12 @@ sector_t dm_bufio_get_device_size(struct dm_bufio_client *c) } EXPORT_SYMBOL_GPL(dm_bufio_get_device_size); +struct dm_io_client *dm_bufio_get_dm_io_client(struct dm_bufio_client *c) +{ + return c->dm_io; +} +EXPORT_SYMBOL_GPL(dm_bufio_get_dm_io_client); + sector_t dm_bufio_get_block_number(struct dm_buffer *b) { return b->block; diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index af6d4f898e4c..2ecd0db0f294 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -551,11 +551,13 @@ static int __create_persistent_data_objects(struct dm_cache_metadata *cmd, return r; } -static void __destroy_persistent_data_objects(struct dm_cache_metadata *cmd) +static void __destroy_persistent_data_objects(struct dm_cache_metadata *cmd, + bool destroy_bm) { dm_sm_destroy(cmd->metadata_sm); dm_tm_destroy(cmd->tm); - dm_block_manager_destroy(cmd->bm); + if (destroy_bm) + dm_block_manager_destroy(cmd->bm); } typedef unsigned long (*flags_mutator)(unsigned long); @@ -826,7 +828,7 @@ static struct dm_cache_metadata *lookup_or_open(struct block_device *bdev, cmd2 = lookup(bdev); if (cmd2) { mutex_unlock(&table_lock); - __destroy_persistent_data_objects(cmd); + __destroy_persistent_data_objects(cmd, true); kfree(cmd); return cmd2; } @@ -874,7 +876,7 @@ void dm_cache_metadata_close(struct dm_cache_metadata *cmd) mutex_unlock(&table_lock); if (!cmd->fail_io) - __destroy_persistent_data_objects(cmd); + __destroy_persistent_data_objects(cmd, true); kfree(cmd); } } @@ -1808,14 +1810,52 @@ int dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd, bool *result) int dm_cache_metadata_abort(struct dm_cache_metadata *cmd) { - int r; + int r = -EINVAL; + struct dm_block_manager *old_bm = NULL, *new_bm = NULL; + + /* fail_io is double-checked with cmd->root_lock held below */ + if (unlikely(cmd->fail_io)) + return r; + + /* + * Replacement block manager (new_bm) is created and old_bm destroyed outside of + * cmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of + * shrinker associated with the block manager's bufio client vs cmd root_lock). + * - must take shrinker_rwsem without holding cmd->root_lock + */ + new_bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT, + CACHE_MAX_CONCURRENT_LOCKS); WRITE_LOCK(cmd); - __destroy_persistent_data_objects(cmd); - r = __create_persistent_data_objects(cmd, false); + if (cmd->fail_io) { + WRITE_UNLOCK(cmd); + goto out; + } + + __destroy_persistent_data_objects(cmd, false); + old_bm = cmd->bm; + if (IS_ERR(new_bm)) { + DMERR("could not create block manager during abort"); + cmd->bm = NULL; + r = PTR_ERR(new_bm); + goto out_unlock; + } + + cmd->bm = new_bm; + r = __open_or_format_metadata(cmd, false); + if (r) { + cmd->bm = NULL; + goto out_unlock; + } + new_bm = NULL; +out_unlock: if (r) cmd->fail_io = true; WRITE_UNLOCK(cmd); + dm_block_manager_destroy(old_bm); +out: + if (new_bm && !IS_ERR(new_bm)) + dm_block_manager_destroy(new_bm); return r; } diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c index 1b5b9ad9e492..6030193b216e 100644 --- a/drivers/md/dm-cache-policy-smq.c +++ b/drivers/md/dm-cache-policy-smq.c @@ -854,7 +854,13 @@ struct smq_policy { struct background_tracker *bg_work; - bool migrations_allowed; + bool migrations_allowed:1; + + /* + * If this is set the policy will try and clean the whole cache + * even if the device is not idle. + */ + bool cleaner:1; }; /*----------------------------------------------------------------*/ @@ -1133,7 +1139,7 @@ static bool clean_target_met(struct smq_policy *mq, bool idle) * Cache entries may not be populated. So we cannot rely on the * size of the clean queue. */ - if (idle) { + if (idle || mq->cleaner) { /* * We'd like to clean everything. */ @@ -1716,11 +1722,9 @@ static void calc_hotspot_params(sector_t origin_size, *hotspot_block_size /= 2u; } -static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size, - sector_t origin_size, - sector_t cache_block_size, - bool mimic_mq, - bool migrations_allowed) +static struct dm_cache_policy * +__smq_create(dm_cblock_t cache_size, sector_t origin_size, sector_t cache_block_size, + bool mimic_mq, bool migrations_allowed, bool cleaner) { unsigned i; unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS; @@ -1807,6 +1811,7 @@ static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size, goto bad_btracker; mq->migrations_allowed = migrations_allowed; + mq->cleaner = cleaner; return &mq->policy; @@ -1830,21 +1835,24 @@ static struct dm_cache_policy *smq_create(dm_cblock_t cache_size, sector_t origin_size, sector_t cache_block_size) { - return __smq_create(cache_size, origin_size, cache_block_size, false, true); + return __smq_create(cache_size, origin_size, cache_block_size, + false, true, false); } static struct dm_cache_policy *mq_create(dm_cblock_t cache_size, sector_t origin_size, sector_t cache_block_size) { - return __smq_create(cache_size, origin_size, cache_block_size, true, true); + return __smq_create(cache_size, origin_size, cache_block_size, + true, true, false); } static struct dm_cache_policy *cleaner_create(dm_cblock_t cache_size, sector_t origin_size, sector_t cache_block_size) { - return __smq_create(cache_size, origin_size, cache_block_size, false, false); + return __smq_create(cache_size, origin_size, cache_block_size, + false, false, true); } /*----------------------------------------------------------------*/ diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 2ddd575e97f7..b3371812a215 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -1010,16 +1010,16 @@ static void abort_transaction(struct cache *cache) if (get_cache_mode(cache) >= CM_READ_ONLY) return; - if (dm_cache_metadata_set_needs_check(cache->cmd)) { - DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name); - set_cache_mode(cache, CM_FAIL); - } - DMERR_LIMIT("%s: aborting current metadata transaction", dev_name); if (dm_cache_metadata_abort(cache->cmd)) { DMERR("%s: failed to abort metadata transaction", dev_name); set_cache_mode(cache, CM_FAIL); } + + if (dm_cache_metadata_set_needs_check(cache->cmd)) { + DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name); + set_cache_mode(cache, CM_FAIL); + } } static void metadata_operation_failed(struct cache *cache, const char *op, int r) @@ -1905,6 +1905,7 @@ static void process_deferred_bios(struct work_struct *ws) else commit_needed = process_bio(cache, bio) || commit_needed; + cond_resched(); } if (commit_needed) @@ -1927,6 +1928,7 @@ static void requeue_deferred_bios(struct cache *cache) while ((bio = bio_list_pop(&bios))) { bio->bi_status = BLK_STS_DM_REQUEUE; bio_endio(bio); + cond_resched(); } } @@ -1967,6 +1969,8 @@ static void check_migrations(struct work_struct *ws) r = mg_start(cache, op, NULL); if (r) break; + + cond_resched(); } } @@ -1987,6 +1991,7 @@ static void destroy(struct cache *cache) if (cache->prison) dm_bio_prison_destroy_v2(cache->prison); + cancel_delayed_work_sync(&cache->waker); if (cache->wq) destroy_workqueue(cache->wq); diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index 7e426e4d1352..8cda3f7ddbae 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -110,6 +110,10 @@ struct mapped_device { /* zero-length flush that will be cloned and submitted to targets */ struct bio flush_bio; + int swap_bios; + struct semaphore swap_bios_semaphore; + struct mutex swap_bios_lock; + struct dm_stats stats; struct kthread_worker kworker; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 07661c3c1513..908bf0768827 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1661,6 +1661,7 @@ pop_from_list: io = crypt_io_from_node(rb_first(&write_tree)); rb_erase(&io->rb_node, &write_tree); kcryptd_io_write(io); + cond_resched(); } while (!RB_EMPTY_ROOT(&write_tree)); blk_finish_plug(&plug); } @@ -2107,7 +2108,7 @@ static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string static int get_key_size(char **key_string) { - return (*key_string[0] == ':') ? -EINVAL : strlen(*key_string) >> 1; + return (*key_string[0] == ':') ? -EINVAL : (int)(strlen(*key_string) >> 1); } #endif @@ -2181,7 +2182,12 @@ static void *crypt_page_alloc(gfp_t gfp_mask, void *pool_data) struct crypt_config *cc = pool_data; struct page *page; - if (unlikely(percpu_counter_compare(&cc->n_allocated_pages, dm_crypt_pages_per_client) >= 0) && + /* + * Note, percpu_counter_read_positive() may over (and under) estimate + * the current usage by at most (batch - 1) * num_online_cpus() pages, + * but avoids potential spinlock contention of an exact result. + */ + if (unlikely(percpu_counter_read_positive(&cc->n_allocated_pages) >= dm_crypt_pages_per_client) && likely(gfp_mask & __GFP_NORETRY)) return NULL; @@ -2852,6 +2858,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) wake_up_process(cc->write_thread); ti->num_flush_bios = 1; + ti->limit_swap_bios = true; return 0; @@ -2926,6 +2933,11 @@ static int crypt_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_SUBMITTED; } +static char hex2asc(unsigned char c) +{ + return c + '0' + ((unsigned)(9 - c) >> 4 & 0x27); +} + static void crypt_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen) { @@ -2944,9 +2956,12 @@ static void crypt_status(struct dm_target *ti, status_type_t type, if (cc->key_size > 0) { if (cc->key_string) DMEMIT(":%u:%s", cc->key_size, cc->key_string); - else - for (i = 0; i < cc->key_size; i++) - DMEMIT("%02x", cc->key[i]); + else { + for (i = 0; i < cc->key_size; i++) { + DMEMIT("%c%c", hex2asc(cc->key[i] >> 4), + hex2asc(cc->key[i] & 0xf)); + } + } } else DMEMIT("-"); diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index f496213f8b67..7c0e7c662e07 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -30,7 +30,7 @@ struct delay_c { struct workqueue_struct *kdelayd_wq; struct work_struct flush_expired_bios; struct list_head delayed_bios; - atomic_t may_delay; + bool may_delay; struct delay_class read; struct delay_class write; @@ -191,7 +191,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) INIT_WORK(&dc->flush_expired_bios, flush_expired_bios); INIT_LIST_HEAD(&dc->delayed_bios); mutex_init(&dc->timer_lock); - atomic_set(&dc->may_delay, 1); + dc->may_delay = true; dc->argc = argc; ret = delay_class_ctr(ti, &dc->read, argv); @@ -245,7 +245,7 @@ static int delay_bio(struct delay_c *dc, struct delay_class *c, struct bio *bio) struct dm_delay_info *delayed; unsigned long expires = 0; - if (!c->delay || !atomic_read(&dc->may_delay)) + if (!c->delay) return DM_MAPIO_REMAPPED; delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info)); @@ -254,6 +254,10 @@ static int delay_bio(struct delay_c *dc, struct delay_class *c, struct bio *bio) delayed->expires = expires = jiffies + msecs_to_jiffies(c->delay); mutex_lock(&delayed_bios_lock); + if (unlikely(!dc->may_delay)) { + mutex_unlock(&delayed_bios_lock); + return DM_MAPIO_REMAPPED; + } c->ops++; list_add_tail(&delayed->list, &dc->delayed_bios); mutex_unlock(&delayed_bios_lock); @@ -267,7 +271,10 @@ static void delay_presuspend(struct dm_target *ti) { struct delay_c *dc = ti->private; - atomic_set(&dc->may_delay, 0); + mutex_lock(&delayed_bios_lock); + dc->may_delay = false; + mutex_unlock(&delayed_bios_lock); + del_timer_sync(&dc->delay_timer); flush_bios(flush_delayed_bios(dc, 1)); } @@ -276,7 +283,7 @@ static void delay_resume(struct dm_target *ti) { struct delay_c *dc = ti->private; - atomic_set(&dc->may_delay, 1); + dc->may_delay = true; } static int delay_map(struct dm_target *ti, struct bio *bio) diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index 8e48920a3ffa..da88237f1fe4 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -46,6 +46,7 @@ struct writeset { static void writeset_free(struct writeset *ws) { vfree(ws->bits); + ws->bits = NULL; } static int setup_on_disk_bitset(struct dm_disk_bitset *info, @@ -70,8 +71,6 @@ static size_t bitset_size(unsigned nr_bits) */ static int writeset_alloc(struct writeset *ws, dm_block_t nr_blocks) { - ws->md.nr_bits = nr_blocks; - ws->md.root = INVALID_WRITESET_ROOT; ws->bits = vzalloc(bitset_size(nr_blocks)); if (!ws->bits) { DMERR("%s: couldn't allocate in memory bitset", __func__); @@ -84,12 +83,14 @@ static int writeset_alloc(struct writeset *ws, dm_block_t nr_blocks) /* * Wipes the in-core bitset, and creates a new on disk bitset. */ -static int writeset_init(struct dm_disk_bitset *info, struct writeset *ws) +static int writeset_init(struct dm_disk_bitset *info, struct writeset *ws, + dm_block_t nr_blocks) { int r; - memset(ws->bits, 0, bitset_size(ws->md.nr_bits)); + memset(ws->bits, 0, bitset_size(nr_blocks)); + ws->md.nr_bits = nr_blocks; r = setup_on_disk_bitset(info, ws->md.nr_bits, &ws->md.root); if (r) { DMERR("%s: setup_on_disk_bitset failed", __func__); @@ -133,7 +134,7 @@ static int writeset_test_and_set(struct dm_disk_bitset *info, { int r; - if (!test_and_set_bit(block, ws->bits)) { + if (!test_bit(block, ws->bits)) { r = dm_bitset_set_bit(info, ws->md.root, block, &ws->md.root); if (r) { /* FIXME: fail mode */ @@ -387,7 +388,7 @@ static void ws_dec(void *context, const void *value) static int ws_eq(void *context, const void *value1, const void *value2) { - return !memcmp(value1, value2, sizeof(struct writeset_metadata)); + return !memcmp(value1, value2, sizeof(struct writeset_disk)); } /*----------------------------------------------------------------*/ @@ -563,6 +564,15 @@ static int open_metadata(struct era_metadata *md) } disk = dm_block_data(sblock); + + /* Verify the data block size hasn't changed */ + if (le32_to_cpu(disk->data_block_size) != md->block_size) { + DMERR("changing the data block size (from %u to %llu) is not supported", + le32_to_cpu(disk->data_block_size), md->block_size); + r = -EINVAL; + goto bad; + } + r = dm_tm_open_with_sm(md->bm, SUPERBLOCK_LOCATION, disk->metadata_space_map_root, sizeof(disk->metadata_space_map_root), @@ -574,10 +584,10 @@ static int open_metadata(struct era_metadata *md) setup_infos(md); - md->block_size = le32_to_cpu(disk->data_block_size); md->nr_blocks = le32_to_cpu(disk->nr_blocks); md->current_era = le32_to_cpu(disk->current_era); + ws_unpack(&disk->current_writeset, &md->current_writeset->md); md->writeset_tree_root = le64_to_cpu(disk->writeset_tree_root); md->era_array_root = le64_to_cpu(disk->era_array_root); md->metadata_snap = le64_to_cpu(disk->metadata_snap); @@ -745,6 +755,12 @@ static int metadata_digest_lookup_writeset(struct era_metadata *md, ws_unpack(&disk, &d->writeset); d->value = cpu_to_le32(key); + /* + * We initialise another bitset info to avoid any caching side effects + * with the previous one. + */ + dm_disk_bitset_init(md->tm, &d->info); + d->nr_bits = min(d->writeset.nr_bits, md->nr_blocks); d->current_bit = 0; d->step = metadata_digest_transcribe_writeset; @@ -758,12 +774,6 @@ static int metadata_digest_start(struct era_metadata *md, struct digest *d) return 0; memset(d, 0, sizeof(*d)); - - /* - * We initialise another bitset info to avoid any caching side - * effects with the previous one. - */ - dm_disk_bitset_init(md->tm, &d->info); d->step = metadata_digest_lookup_writeset; return 0; @@ -801,6 +811,8 @@ static struct era_metadata *metadata_open(struct block_device *bdev, static void metadata_close(struct era_metadata *md) { + writeset_free(&md->writesets[0]); + writeset_free(&md->writesets[1]); destroy_persistent_data_objects(md); kfree(md); } @@ -838,6 +850,7 @@ static int metadata_resize(struct era_metadata *md, void *arg) r = writeset_alloc(&md->writesets[1], *new_size); if (r) { DMERR("%s: writeset_alloc failed for writeset 1", __func__); + writeset_free(&md->writesets[0]); return r; } @@ -848,6 +861,8 @@ static int metadata_resize(struct era_metadata *md, void *arg) &value, &md->era_array_root); if (r) { DMERR("%s: dm_array_resize failed", __func__); + writeset_free(&md->writesets[0]); + writeset_free(&md->writesets[1]); return r; } @@ -869,7 +884,6 @@ static int metadata_era_archive(struct era_metadata *md) } ws_pack(&md->current_writeset->md, &value); - md->current_writeset->md.root = INVALID_WRITESET_ROOT; keys[0] = md->current_era; __dm_bless_for_disk(&value); @@ -881,6 +895,7 @@ static int metadata_era_archive(struct era_metadata *md) return r; } + md->current_writeset->md.root = INVALID_WRITESET_ROOT; md->archived_writesets = true; return 0; @@ -897,7 +912,7 @@ static int metadata_new_era(struct era_metadata *md) int r; struct writeset *new_writeset = next_writeset(md); - r = writeset_init(&md->bitset_info, new_writeset); + r = writeset_init(&md->bitset_info, new_writeset, md->nr_blocks); if (r) { DMERR("%s: writeset_init failed", __func__); return r; @@ -950,7 +965,7 @@ static int metadata_commit(struct era_metadata *md) int r; struct dm_block *sblock; - if (md->current_writeset->md.root != SUPERBLOCK_LOCATION) { + if (md->current_writeset->md.root != INVALID_WRITESET_ROOT) { r = dm_bitset_flush(&md->bitset_info, md->current_writeset->md.root, &md->current_writeset->md.root); if (r) { @@ -1225,8 +1240,10 @@ static void process_deferred_bios(struct era *era) int r; struct bio_list deferred_bios, marked_bios; struct bio *bio; + struct blk_plug plug; bool commit_needed = false; bool failed = false; + struct writeset *ws = era->md->current_writeset; bio_list_init(&deferred_bios); bio_list_init(&marked_bios); @@ -1236,9 +1253,11 @@ static void process_deferred_bios(struct era *era) bio_list_init(&era->deferred_bios); spin_unlock(&era->deferred_lock); + if (bio_list_empty(&deferred_bios)) + return; + while ((bio = bio_list_pop(&deferred_bios))) { - r = writeset_test_and_set(&era->md->bitset_info, - era->md->current_writeset, + r = writeset_test_and_set(&era->md->bitset_info, ws, get_block(era, bio)); if (r < 0) { /* @@ -1246,7 +1265,6 @@ static void process_deferred_bios(struct era *era) * FIXME: finish. */ failed = true; - } else if (r == 0) commit_needed = true; @@ -1262,9 +1280,19 @@ static void process_deferred_bios(struct era *era) if (failed) while ((bio = bio_list_pop(&marked_bios))) bio_io_error(bio); - else - while ((bio = bio_list_pop(&marked_bios))) + else { + blk_start_plug(&plug); + while ((bio = bio_list_pop(&marked_bios))) { + /* + * Only update the in-core writeset if the on-disk one + * was updated too. + */ + if (commit_needed) + set_bit(get_block(era, bio), ws->bits); generic_make_request(bio); + } + blk_finish_plug(&plug); + } } static void process_rpc_calls(struct era *era) @@ -1368,7 +1396,7 @@ static void start_worker(struct era *era) static void stop_worker(struct era *era) { atomic_set(&era->suspended, 1); - flush_workqueue(era->wq); + drain_workqueue(era->wq); } /*---------------------------------------------------------------- @@ -1485,15 +1513,6 @@ static int era_ctr(struct dm_target *ti, unsigned argc, char **argv) } era->md = md; - era->nr_blocks = calc_nr_blocks(era); - - r = metadata_resize(era->md, &era->nr_blocks); - if (r) { - ti->error = "couldn't resize metadata"; - era_destroy(era); - return -ENOMEM; - } - era->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); if (!era->wq) { ti->error = "could not create workqueue for metadata object"; @@ -1561,6 +1580,12 @@ static void era_postsuspend(struct dm_target *ti) } stop_worker(era); + + r = metadata_commit(era->md); + if (r) { + DMERR("%s: metadata_commit failed", __func__); + /* FIXME: fail mode */ + } } static int era_preresume(struct dm_target *ti) @@ -1570,16 +1595,24 @@ static int era_preresume(struct dm_target *ti) dm_block_t new_size = calc_nr_blocks(era); if (era->nr_blocks != new_size) { - r = in_worker1(era, metadata_resize, &new_size); - if (r) + r = metadata_resize(era->md, &new_size); + if (r) { + DMERR("%s: metadata_resize failed", __func__); return r; + } + + r = metadata_commit(era->md); + if (r) { + DMERR("%s: metadata_commit failed", __func__); + return r; + } era->nr_blocks = new_size; } start_worker(era); - r = in_worker0(era, metadata_new_era); + r = in_worker0(era, metadata_era_rollover); if (r) { DMERR("%s: metadata_era_rollover failed", __func__); return r; diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index 2fcf62fb2844..5116856ea81d 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -124,9 +124,9 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc, * Direction r or w? */ arg_name = dm_shift_arg(as); - if (!strcasecmp(arg_name, "w")) + if (arg_name && !strcasecmp(arg_name, "w")) fc->corrupt_bio_rw = WRITE; - else if (!strcasecmp(arg_name, "r")) + else if (arg_name && !strcasecmp(arg_name, "r")) fc->corrupt_bio_rw = READ; else { ti->error = "Invalid corrupt bio direction (r or w)"; @@ -301,8 +301,11 @@ static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc) */ bio_for_each_segment(bvec, bio, iter) { if (bio_iter_len(bio, iter) > corrupt_bio_byte) { - char *segment = (page_address(bio_iter_page(bio, iter)) - + bio_iter_offset(bio, iter)); + char *segment; + struct page *page = bio_iter_page(bio, iter); + if (unlikely(page == ZERO_PAGE(0))) + break; + segment = (page_address(page) + bio_iter_offset(bio, iter)); segment[corrupt_bio_byte] = fc->corrupt_bio_value; DMDEBUG("Corrupting data bio=%p by writing %u to byte %u " "(rw=%c bi_opf=%u bi_sector=%llu size=%u)\n", @@ -364,9 +367,11 @@ static int flakey_map(struct dm_target *ti, struct bio *bio) /* * Corrupt matching writes. */ - if (fc->corrupt_bio_byte && (fc->corrupt_bio_rw == WRITE)) { - if (all_corrupt_bio_flags_match(bio, fc)) - corrupt_bio_data(bio, fc); + if (fc->corrupt_bio_byte) { + if (fc->corrupt_bio_rw == WRITE) { + if (all_corrupt_bio_flags_match(bio, fc)) + corrupt_bio_data(bio, fc); + } goto map_bio; } @@ -397,13 +402,14 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio, } if (!*error && pb->bio_submitted && (bio_data_dir(bio) == READ)) { - if (fc->corrupt_bio_byte && (fc->corrupt_bio_rw == READ) && - all_corrupt_bio_flags_match(bio, fc)) { - /* - * Corrupt successful matching READs while in down state. - */ - corrupt_bio_data(bio, fc); - + if (fc->corrupt_bio_byte) { + if ((fc->corrupt_bio_rw == READ) && + all_corrupt_bio_flags_match(bio, fc)) { + /* + * Corrupt successful matching READs while in down state. + */ + corrupt_bio_data(bio, fc); + } } else if (!test_bit(DROP_WRITES, &fc->flags) && !test_bit(ERROR_WRITES, &fc->flags)) { /* diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index e8eeee680750..a884fcf65063 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -29,11 +29,11 @@ #define DEFAULT_BUFFER_SECTORS 128 #define DEFAULT_JOURNAL_WATERMARK 50 #define DEFAULT_SYNC_MSEC 10000 -#define DEFAULT_MAX_JOURNAL_SECTORS 131072 +#define DEFAULT_MAX_JOURNAL_SECTORS (IS_ENABLED(CONFIG_64BIT) ? 131072 : 8192) #define MIN_LOG2_INTERLEAVE_SECTORS 3 #define MAX_LOG2_INTERLEAVE_SECTORS 31 #define METADATA_WORKQUEUE_MAX_ACTIVE 16 -#define RECALC_SECTORS 8192 +#define RECALC_SECTORS (IS_ENABLED(CONFIG_64BIT) ? 32768 : 2048) #define RECALC_WRITE_SUPER 16 /* @@ -240,6 +240,7 @@ struct dm_integrity_c { bool journal_uptodate; bool just_formatted; + bool legacy_recalculate; struct alg_spec internal_hash_alg; struct alg_spec journal_crypt_alg; @@ -345,6 +346,14 @@ static int dm_integrity_failed(struct dm_integrity_c *ic) return READ_ONCE(ic->failed); } +static bool dm_integrity_disable_recalculate(struct dm_integrity_c *ic) +{ + if ((ic->internal_hash_alg.key || ic->journal_mac_alg.key) && + !ic->legacy_recalculate) + return true; + return false; +} + static commit_id_t dm_integrity_commit_id(struct dm_integrity_c *ic, unsigned i, unsigned j, unsigned char seq) { @@ -1153,12 +1162,52 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se return 0; } -static void dm_integrity_flush_buffers(struct dm_integrity_c *ic) +struct flush_request { + struct dm_io_request io_req; + struct dm_io_region io_reg; + struct dm_integrity_c *ic; + struct completion comp; +}; + +static void flush_notify(unsigned long error, void *fr_) +{ + struct flush_request *fr = fr_; + if (unlikely(error != 0)) + dm_integrity_io_error(fr->ic, "flusing disk cache", -EIO); + complete(&fr->comp); +} + +static void dm_integrity_flush_buffers(struct dm_integrity_c *ic, bool flush_data) { int r; + + struct flush_request fr; + + if (!ic->meta_dev) + flush_data = false; + if (flush_data) { + fr.io_req.bi_op = REQ_OP_WRITE, + fr.io_req.bi_op_flags = REQ_PREFLUSH | REQ_SYNC, + fr.io_req.mem.type = DM_IO_KMEM, + fr.io_req.mem.ptr.addr = NULL, + fr.io_req.notify.fn = flush_notify, + fr.io_req.notify.context = &fr; + fr.io_req.client = dm_bufio_get_dm_io_client(ic->bufio), + fr.io_reg.bdev = ic->dev->bdev, + fr.io_reg.sector = 0, + fr.io_reg.count = 0, + fr.ic = ic; + init_completion(&fr.comp); + r = dm_io(&fr.io_req, 1, &fr.io_reg, NULL); + BUG_ON(r); + } + r = dm_bufio_write_dirty_buffers(ic->bufio); if (unlikely(r)) dm_integrity_io_error(ic, "writing tags", r); + + if (flush_data) + wait_for_completion(&fr.comp); } static void sleep_on_endio_wait(struct dm_integrity_c *ic) @@ -1330,11 +1379,12 @@ static void integrity_metadata(struct work_struct *w) checksums = checksums_onstack; __bio_for_each_segment(bv, bio, iter, dio->bio_details.bi_iter) { + struct bio_vec bv_copy = bv; unsigned pos; char *mem, *checksums_ptr; again: - mem = (char *)kmap_atomic(bv.bv_page) + bv.bv_offset; + mem = (char *)kmap_atomic(bv_copy.bv_page) + bv_copy.bv_offset; pos = 0; checksums_ptr = checksums; do { @@ -1343,7 +1393,7 @@ again: sectors_to_process -= ic->sectors_per_block; pos += ic->sectors_per_block << SECTOR_SHIFT; sector += ic->sectors_per_block; - } while (pos < bv.bv_len && sectors_to_process && checksums != checksums_onstack); + } while (pos < bv_copy.bv_len && sectors_to_process && checksums != checksums_onstack); kunmap_atomic(mem); r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset, @@ -1363,9 +1413,9 @@ again: if (!sectors_to_process) break; - if (unlikely(pos < bv.bv_len)) { - bv.bv_offset += pos; - bv.bv_len -= pos; + if (unlikely(pos < bv_copy.bv_len)) { + bv_copy.bv_offset += pos; + bv_copy.bv_len -= pos; goto again; } } @@ -1846,7 +1896,7 @@ static void integrity_commit(struct work_struct *w) flushes = bio_list_get(&ic->flush_bio_list); if (unlikely(ic->mode != 'J')) { spin_unlock_irq(&ic->endio_wait.lock); - dm_integrity_flush_buffers(ic); + dm_integrity_flush_buffers(ic, true); goto release_flush_bios; } @@ -2057,7 +2107,7 @@ skip_io: complete_journal_op(&comp); wait_for_completion_io(&comp.comp); - dm_integrity_flush_buffers(ic); + dm_integrity_flush_buffers(ic, true); } static void integrity_writer(struct work_struct *w) @@ -2067,10 +2117,6 @@ static void integrity_writer(struct work_struct *w) unsigned prev_free_sectors; - /* the following test is not needed, but it tests the replay code */ - if (unlikely(dm_post_suspending(ic->ti)) && !ic->meta_dev) - return; - spin_lock_irq(&ic->endio_wait.lock); write_start = ic->committed_section; write_sections = ic->n_committed_sections; @@ -2099,7 +2145,7 @@ static void recalc_write_super(struct dm_integrity_c *ic) { int r; - dm_integrity_flush_buffers(ic); + dm_integrity_flush_buffers(ic, false); if (dm_integrity_failed(ic)) return; @@ -2406,10 +2452,9 @@ static void dm_integrity_postsuspend(struct dm_target *ti) drain_workqueue(ic->commit_wq); if (ic->mode == 'J') { - if (ic->meta_dev) - queue_work(ic->writer_wq, &ic->writer_work); + queue_work(ic->writer_wq, &ic->writer_work); drain_workqueue(ic->writer_wq); - dm_integrity_flush_buffers(ic); + dm_integrity_flush_buffers(ic, true); } BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress)); @@ -2463,6 +2508,7 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, arg_count += !!ic->internal_hash_alg.alg_string; arg_count += !!ic->journal_crypt_alg.alg_string; arg_count += !!ic->journal_mac_alg.alg_string; + arg_count += ic->legacy_recalculate; DMEMIT("%s %llu %u %c %u", ic->dev->name, (unsigned long long)ic->start, ic->tag_size, ic->mode, arg_count); if (ic->meta_dev) @@ -2476,6 +2522,8 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, DMEMIT(" buffer_sectors:%u", 1U << ic->log2_buffer_sectors); DMEMIT(" journal_watermark:%u", (unsigned)watermark_percentage); DMEMIT(" commit_time:%u", ic->autocommit_msec); + if (ic->legacy_recalculate) + DMEMIT(" legacy_recalculate"); #define EMIT_ALG(a, n) \ do { \ @@ -3078,7 +3126,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) unsigned extra_args; struct dm_arg_set as; static const struct dm_arg _args[] = { - {0, 9, "Invalid number of feature args"}, + {0, 12, "Invalid number of feature args"}, }; unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec; bool recalculate; @@ -3208,6 +3256,8 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } else if (!strcmp(opt_string, "recalculate")) { recalculate = true; + } else if (!strcmp(opt_string, "legacy_recalculate")) { + ic->legacy_recalculate = true; } else { r = -EINVAL; ti->error = "Invalid argument"; @@ -3450,6 +3500,7 @@ try_smaller_buffer: } if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) { + size_t recalc_tags_size; if (!ic->internal_hash) { r = -EINVAL; ti->error = "Recalculate is only valid with internal hash"; @@ -3468,13 +3519,29 @@ try_smaller_buffer: r = -ENOMEM; goto bad; } - ic->recalc_tags = kvmalloc_array(RECALC_SECTORS >> ic->sb->log2_sectors_per_block, - ic->tag_size, GFP_KERNEL); + recalc_tags_size = (RECALC_SECTORS >> ic->sb->log2_sectors_per_block) * ic->tag_size; + if (crypto_shash_digestsize(ic->internal_hash) > ic->tag_size) + recalc_tags_size += crypto_shash_digestsize(ic->internal_hash) - ic->tag_size; + ic->recalc_tags = kvmalloc(recalc_tags_size, GFP_KERNEL); if (!ic->recalc_tags) { ti->error = "Cannot allocate tags for recalculating"; r = -ENOMEM; goto bad; } + } else { + if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) { + ti->error = "Recalculate can only be specified with internal_hash"; + r = -EINVAL; + goto bad; + } + } + + if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) && + le64_to_cpu(ic->sb->recalc_sector) < ic->provided_data_sectors && + dm_integrity_disable_recalculate(ic)) { + ti->error = "Recalculating with HMAC is disabled for security reasons - if you really need it, use the argument \"legacy_recalculate\""; + r = -EOPNOTSUPP; + goto bad; } ic->bufio = dm_bufio_client_create(ic->meta_dev ? ic->meta_dev->bdev : ic->dev->bdev, @@ -3494,8 +3561,6 @@ try_smaller_buffer: } if (should_write_sb) { - int r; - init_journal(ic, 0, ic->journal_sections, 0); r = dm_integrity_failed(ic); if (unlikely(r)) { @@ -3625,11 +3690,13 @@ int __init dm_integrity_init(void) } r = dm_register_target(&integrity_target); - - if (r < 0) + if (r < 0) { DMERR("register failed %d", r); + kmem_cache_destroy(journal_io_cache); + return r; + } - return r; + return 0; } void dm_integrity_exit(void) diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 81ffc59d05c9..4312007d2d34 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -306,7 +306,7 @@ static void do_region(int op, int op_flags, unsigned region, struct request_queue *q = bdev_get_queue(where->bdev); unsigned short logical_block_size = queue_logical_block_size(q); sector_t num_sectors; - unsigned int uninitialized_var(special_cmd_max_sectors); + unsigned int special_cmd_max_sectors; /* * Reject unsupported discard and write same requests. diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index f666778ad237..88e89796ccbf 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -17,6 +17,7 @@ #include <linux/dm-ioctl.h> #include <linux/hdreg.h> #include <linux/compat.h> +#include <linux/nospec.h> #include <linux/uaccess.h> @@ -529,7 +530,7 @@ static int list_devices(struct file *filp, struct dm_ioctl *param, size_t param_ * Grab our output buffer. */ nl = orig_nl = get_result_buffer(param, param_size, &len); - if (len < needed) { + if (len < needed || len < sizeof(nl->dev)) { param->flags |= DM_BUFFER_FULL_FLAG; goto out; } @@ -572,7 +573,7 @@ static void list_version_get_needed(struct target_type *tt, void *needed_param) size_t *needed = needed_param; *needed += sizeof(struct dm_target_versions); - *needed += strlen(tt->name); + *needed += strlen(tt->name) + 1; *needed += ALIGN_MASK; } @@ -627,7 +628,7 @@ static int list_versions(struct file *filp, struct dm_ioctl *param, size_t param iter_info.old_vers = NULL; iter_info.vers = vers; iter_info.flags = 0; - iter_info.end = (char *)vers+len; + iter_info.end = (char *)vers + needed; /* * Now loop through filling out the names & versions. @@ -1409,11 +1410,12 @@ static int table_clear(struct file *filp, struct dm_ioctl *param, size_t param_s hc->new_map = NULL; } - param->flags &= ~DM_INACTIVE_PRESENT_FLAG; - - __dev_status(hc->md, param); md = hc->md; up_write(&_hash_lock); + + param->flags &= ~DM_INACTIVE_PRESENT_FLAG; + __dev_status(md, param); + if (old_map) { dm_sync_table(md); dm_table_destroy(old_map); @@ -1575,6 +1577,7 @@ static int target_message(struct file *filp, struct dm_ioctl *param, size_t para if (!argc) { DMWARN("Empty message received."); + r = -EINVAL; goto out_argv; } @@ -1669,6 +1672,7 @@ static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags) if (unlikely(cmd >= ARRAY_SIZE(_ioctls))) return NULL; + cmd = array_index_nospec(cmd, ARRAY_SIZE(_ioctls)); *ioctl_flags = _ioctls[cmd].flags; return _ioctls[cmd].fn; } @@ -1818,7 +1822,7 @@ static int ctl_ioctl(struct file *file, uint command, struct dm_ioctl __user *us int ioctl_flags; int param_flags; unsigned int cmd; - struct dm_ioctl *uninitialized_var(param); + struct dm_ioctl *param; ioctl_fn fn = NULL; size_t input_param_size; struct dm_ioctl param_kernel; diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 23de59a692c5..72aa5097b68f 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -998,12 +998,13 @@ static int validate_region_size(struct raid_set *rs, unsigned long region_size) static int validate_raid_redundancy(struct raid_set *rs) { unsigned int i, rebuild_cnt = 0; - unsigned int rebuilds_per_group = 0, copies; + unsigned int rebuilds_per_group = 0, copies, raid_disks; unsigned int group_size, last_group_start; - for (i = 0; i < rs->md.raid_disks; i++) - if (!test_bit(In_sync, &rs->dev[i].rdev.flags) || - !rs->dev[i].rdev.sb_page) + for (i = 0; i < rs->raid_disks; i++) + if (!test_bit(FirstUse, &rs->dev[i].rdev.flags) && + ((!test_bit(In_sync, &rs->dev[i].rdev.flags) || + !rs->dev[i].rdev.sb_page))) rebuild_cnt++; switch (rs->md.level) { @@ -1043,8 +1044,9 @@ static int validate_raid_redundancy(struct raid_set *rs) * A A B B C * C D D E E */ + raid_disks = min(rs->raid_disks, rs->md.raid_disks); if (__is_raid10_near(rs->md.new_layout)) { - for (i = 0; i < rs->md.raid_disks; i++) { + for (i = 0; i < raid_disks; i++) { if (!(i % copies)) rebuilds_per_group = 0; if ((!rs->dev[i].rdev.sb_page || @@ -1067,10 +1069,10 @@ static int validate_raid_redundancy(struct raid_set *rs) * results in the need to treat the last (potentially larger) * set differently. */ - group_size = (rs->md.raid_disks / copies); - last_group_start = (rs->md.raid_disks / group_size) - 1; + group_size = (raid_disks / copies); + last_group_start = (raid_disks / group_size) - 1; last_group_start *= group_size; - for (i = 0; i < rs->md.raid_disks; i++) { + for (i = 0; i < raid_disks; i++) { if (!(i % copies) && !(i > last_group_start)) rebuilds_per_group = 0; if ((!rs->dev[i].rdev.sb_page || @@ -1585,7 +1587,7 @@ static sector_t __rdev_sectors(struct raid_set *rs) { int i; - for (i = 0; i < rs->md.raid_disks; i++) { + for (i = 0; i < rs->raid_disks; i++) { struct md_rdev *rdev = &rs->dev[i].rdev; if (!test_bit(Journal, &rdev->flags) && @@ -1892,6 +1894,14 @@ static bool rs_takeover_requested(struct raid_set *rs) return rs->md.new_level != rs->md.level; } +/* True if layout is set to reshape. */ +static bool rs_is_layout_change(struct raid_set *rs, bool use_mddev) +{ + return (use_mddev ? rs->md.delta_disks : rs->delta_disks) || + rs->md.new_layout != rs->md.layout || + rs->md.new_chunk_sectors != rs->md.chunk_sectors; +} + /* True if @rs is requested to reshape by ctr */ static bool rs_reshape_requested(struct raid_set *rs) { @@ -1904,9 +1914,7 @@ static bool rs_reshape_requested(struct raid_set *rs) if (rs_is_raid0(rs)) return false; - change = mddev->new_layout != mddev->layout || - mddev->new_chunk_sectors != mddev->chunk_sectors || - rs->delta_disks; + change = rs_is_layout_change(rs, false); /* Historical case to support raid1 reshape without delta disks */ if (rs_is_raid1(rs)) { @@ -2843,7 +2851,7 @@ static sector_t _get_reshape_sectors(struct raid_set *rs) } /* - * + * Reshape: * - change raid layout * - change chunk size * - add disks @@ -2953,6 +2961,20 @@ static int rs_setup_reshape(struct raid_set *rs) } /* + * If the md resync thread has updated superblock with max reshape position + * at the end of a reshape but not (yet) reset the layout configuration + * changes -> reset the latter. + */ +static void rs_reset_inconclusive_reshape(struct raid_set *rs) +{ + if (!rs_is_reshaping(rs) && rs_is_layout_change(rs, true)) { + rs_set_cur(rs); + rs->md.delta_disks = 0; + rs->md.reshape_backwards = 0; + } +} + +/* * Enable/disable discard support on RAID set depending on * RAID level and discard properties of underlying RAID members. */ @@ -3221,11 +3243,14 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (r) goto bad; + /* Catch any inconclusive reshape superblock content. */ + rs_reset_inconclusive_reshape(rs); + /* Start raid set read-only and assumed clean to change in raid_resume() */ rs->md.ro = 1; rs->md.in_sync = 1; - /* Keep array frozen */ + /* Keep array frozen until resume. */ set_bit(MD_RECOVERY_FROZEN, &rs->md.recovery); /* Has to be held on running the array */ @@ -3239,7 +3264,6 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) } r = md_start(&rs->md); - if (r) { ti->error = "Failed to start raid array"; mddev_unlock(&rs->md); @@ -3265,15 +3289,19 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) /* Try to adjust the raid4/5/6 stripe cache size to the stripe size */ if (rs_is_raid456(rs)) { r = rs_set_raid456_stripe_cache(rs); - if (r) + if (r) { + mddev_unlock(&rs->md); goto bad_stripe_cache; + } } /* Now do an early reshape check */ if (test_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags)) { r = rs_check_reshape(rs); - if (r) + if (r) { + mddev_unlock(&rs->md); goto bad_check_reshape; + } /* Restore new, ctr requested layout to perform check */ rs_config_restore(rs, &rs_layout); @@ -3282,6 +3310,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) r = rs->md.pers->check_reshape(&rs->md); if (r) { ti->error = "Reshape check failed"; + mddev_unlock(&rs->md); goto bad_check_reshape; } } @@ -3509,7 +3538,7 @@ static void raid_status(struct dm_target *ti, status_type_t type, { struct raid_set *rs = ti->private; struct mddev *mddev = &rs->md; - struct r5conf *conf = mddev->private; + struct r5conf *conf = rs_is_raid456(rs) ? mddev->private : NULL; int i, max_nr_stripes = conf ? conf->max_nr_stripes : 0; unsigned long recovery; unsigned int raid_param_cnt = 1; /* at least 1 for chunksize */ @@ -3729,13 +3758,13 @@ static int raid_iterate_devices(struct dm_target *ti, unsigned int i; int r = 0; - for (i = 0; !r && i < rs->md.raid_disks; i++) - if (rs->dev[i].data_dev) - r = fn(ti, - rs->dev[i].data_dev, - 0, /* No offset on data devs */ - rs->md.dev_sectors, - data); + for (i = 0; !r && i < rs->raid_disks; i++) { + if (rs->dev[i].data_dev) { + r = fn(ti, rs->dev[i].data_dev, + 0, /* No offset on data devs */ + rs->md.dev_sectors, data); + } + } return r; } @@ -3780,7 +3809,7 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs) memset(cleared_failed_devices, 0, sizeof(cleared_failed_devices)); - for (i = 0; i < mddev->raid_disks; i++) { + for (i = 0; i < rs->raid_disks; i++) { r = &rs->dev[i].rdev; /* HM FIXME: enhance journal device recovery processing */ if (test_bit(Journal, &r->flags)) diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 9fde174ce396..2957a3763f01 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -831,6 +831,7 @@ out_tag_set: blk_mq_free_tag_set(md->tag_set); out_kfree_tag_set: kfree(md->tag_set); + md->tag_set = NULL; return err; } @@ -840,6 +841,7 @@ void dm_mq_cleanup_mapped_device(struct mapped_device *md) if (md->tag_set) { blk_mq_free_tag_set(md->tag_set); kfree(md->tag_set); + md->tag_set = NULL; } } diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 963d3774c93e..247089c2be25 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -613,7 +613,7 @@ static int persistent_read_metadata(struct dm_exception_store *store, chunk_t old, chunk_t new), void *callback_context) { - int r, uninitialized_var(new_snapshot); + int r, new_snapshot; struct pstore *ps = get_info(store); /* diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index d3f28a9e3fd9..52101e5c7258 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -137,6 +137,11 @@ struct dm_snapshot { * for them to be committed. */ struct bio_list bios_queued_during_merge; + + /* + * Flush data after merge. + */ + struct bio flush_bio; }; /* @@ -789,7 +794,7 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new) static uint32_t __minimum_chunk_size(struct origin *o) { struct dm_snapshot *snap; - unsigned chunk_size = 0; + unsigned chunk_size = rounddown_pow_of_two(UINT_MAX); if (o) list_for_each_entry(snap, &o->snapshots, list) @@ -1061,6 +1066,17 @@ shut: static void error_bios(struct bio *bio); +static int flush_data(struct dm_snapshot *s) +{ + struct bio *flush_bio = &s->flush_bio; + + bio_reset(flush_bio); + bio_set_dev(flush_bio, s->origin->bdev); + flush_bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; + + return submit_bio_wait(flush_bio); +} + static void merge_callback(int read_err, unsigned long write_err, void *context) { struct dm_snapshot *s = context; @@ -1074,6 +1090,11 @@ static void merge_callback(int read_err, unsigned long write_err, void *context) goto shut; } + if (flush_data(s) < 0) { + DMERR("Flush after merge failed: shutting down merge"); + goto shut; + } + if (s->store->type->commit_merge(s->store, s->num_merging_chunks) < 0) { DMERR("Write error in exception store: shutting down merge"); @@ -1198,6 +1219,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) s->first_merging_chunk = 0; s->num_merging_chunks = 0; bio_list_init(&s->bios_queued_during_merge); + bio_init(&s->flush_bio, NULL, 0); /* Allocate hash table for COW data */ if (init_hash_tables(s)) { @@ -1264,6 +1286,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (!s->store->chunk_size) { ti->error = "Chunk size not set"; + r = -EINVAL; goto bad_read_metadata; } @@ -1391,6 +1414,8 @@ static void snapshot_dtr(struct dm_target *ti) mutex_destroy(&s->lock); + bio_uninit(&s->flush_bio); + dm_put_device(ti, s->cow); dm_put_device(ti, s->origin); diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c index 21de30b4e2a1..0eb48e739f7e 100644 --- a/drivers/md/dm-stats.c +++ b/drivers/md/dm-stats.c @@ -188,7 +188,7 @@ static int dm_stat_in_flight(struct dm_stat_shared *shared) atomic_read(&shared->in_flight[WRITE]); } -void dm_stats_init(struct dm_stats *stats) +int dm_stats_init(struct dm_stats *stats) { int cpu; struct dm_stats_last_position *last; @@ -196,11 +196,16 @@ void dm_stats_init(struct dm_stats *stats) mutex_init(&stats->mutex); INIT_LIST_HEAD(&stats->list); stats->last = alloc_percpu(struct dm_stats_last_position); + if (!stats->last) + return -ENOMEM; + for_each_possible_cpu(cpu) { last = per_cpu_ptr(stats->last, cpu); last->last_sector = (sector_t)ULLONG_MAX; last->last_rw = UINT_MAX; } + + return 0; } void dm_stats_cleanup(struct dm_stats *stats) @@ -224,6 +229,7 @@ void dm_stats_cleanup(struct dm_stats *stats) atomic_read(&shared->in_flight[READ]), atomic_read(&shared->in_flight[WRITE])); } + cond_resched(); } dm_stat_free(&s->rcu_head); } @@ -313,6 +319,7 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end, for (ni = 0; ni < n_entries; ni++) { atomic_set(&s->stat_shared[ni].in_flight[READ], 0); atomic_set(&s->stat_shared[ni].in_flight[WRITE], 0); + cond_resched(); } if (s->n_histogram_entries) { @@ -325,6 +332,7 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end, for (ni = 0; ni < n_entries; ni++) { s->stat_shared[ni].tmp.histogram = hi; hi += s->n_histogram_entries + 1; + cond_resched(); } } @@ -345,6 +353,7 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end, for (ni = 0; ni < n_entries; ni++) { p[ni].histogram = hi; hi += s->n_histogram_entries + 1; + cond_resched(); } } } @@ -474,6 +483,7 @@ static int dm_stats_list(struct dm_stats *stats, const char *program, } DMEMIT("\n"); } + cond_resched(); } mutex_unlock(&stats->mutex); @@ -750,6 +760,7 @@ static void __dm_stat_clear(struct dm_stat *s, size_t idx_start, size_t idx_end, local_irq_enable(); } } + cond_resched(); } } @@ -865,6 +876,8 @@ static int dm_stats_print(struct dm_stats *stats, int id, if (unlikely(sz + 1 >= maxlen)) goto buffer_overflow; + + cond_resched(); } if (clear) diff --git a/drivers/md/dm-stats.h b/drivers/md/dm-stats.h index 2ddfae678f32..dcac11fce03b 100644 --- a/drivers/md/dm-stats.h +++ b/drivers/md/dm-stats.h @@ -22,7 +22,7 @@ struct dm_stats_aux { unsigned long long duration_ns; }; -void dm_stats_init(struct dm_stats *st); +int dm_stats_init(struct dm_stats *st); void dm_stats_cleanup(struct dm_stats *st); struct mapped_device; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 36275c59e4e7..3faaf21be5b6 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -431,14 +431,23 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, { int r; dev_t dev; + unsigned int major, minor; + char dummy; struct dm_dev_internal *dd; struct dm_table *t = ti->table; BUG_ON(!t); - dev = dm_get_dev_t(path); - if (!dev) - return -ENODEV; + if (sscanf(path, "%u:%u%c", &major, &minor, &dummy) == 2) { + /* Extract the major/minor numbers */ + dev = MKDEV(major, minor); + if (MAJOR(dev) != major || MINOR(dev) != minor) + return -EOVERFLOW; + } else { + dev = dm_get_dev_t(path); + if (!dev) + return -ENODEV; + } dd = find_device(&t->devices, dev); if (!dd) { @@ -662,7 +671,7 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table, */ unsigned short remaining = 0; - struct dm_target *uninitialized_var(ti); + struct dm_target *ti; struct queue_limits ti_limits; unsigned i; @@ -882,10 +891,10 @@ void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type) } EXPORT_SYMBOL_GPL(dm_table_set_type); -static int device_supports_dax(struct dm_target *ti, struct dm_dev *dev, +static int device_not_dax_capable(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { - return bdev_dax_supported(dev->bdev, PAGE_SIZE); + return !bdev_dax_supported(dev->bdev, PAGE_SIZE); } static bool dm_table_supports_dax(struct dm_table *t) @@ -901,7 +910,7 @@ static bool dm_table_supports_dax(struct dm_table *t) return false; if (!ti->type->iterate_devices || - !ti->type->iterate_devices(ti, device_supports_dax, NULL)) + ti->type->iterate_devices(ti, device_not_dax_capable, NULL)) return false; } @@ -1336,12 +1345,6 @@ void dm_table_event_callback(struct dm_table *t, void dm_table_event(struct dm_table *t) { - /* - * You can no longer call dm_table_event() from interrupt - * context, use a bottom half instead. - */ - BUG_ON(in_interrupt()); - mutex_lock(&_event_lock); if (t->event_fn) t->event_fn(t->event_context); @@ -1389,6 +1392,46 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) return &t->targets[(KEYS_PER_NODE * n) + k]; } +/* + * type->iterate_devices() should be called when the sanity check needs to + * iterate and check all underlying data devices. iterate_devices() will + * iterate all underlying data devices until it encounters a non-zero return + * code, returned by whether the input iterate_devices_callout_fn, or + * iterate_devices() itself internally. + * + * For some target type (e.g. dm-stripe), one call of iterate_devices() may + * iterate multiple underlying devices internally, in which case a non-zero + * return code returned by iterate_devices_callout_fn will stop the iteration + * in advance. + * + * Cases requiring _any_ underlying device supporting some kind of attribute, + * should use the iteration structure like dm_table_any_dev_attr(), or call + * it directly. @func should handle semantics of positive examples, e.g. + * capable of something. + * + * Cases requiring _all_ underlying devices supporting some kind of attribute, + * should use the iteration structure like dm_table_supports_nowait() or + * dm_table_supports_discards(). Or introduce dm_table_all_devs_attr() that + * uses an @anti_func that handle semantics of counter examples, e.g. not + * capable of something. So: return !dm_table_any_dev_attr(t, anti_func, data); + */ +static bool dm_table_any_dev_attr(struct dm_table *t, + iterate_devices_callout_fn func, void *data) +{ + struct dm_target *ti; + unsigned int i; + + for (i = 0; i < dm_table_get_num_targets(t); i++) { + ti = dm_table_get_target(t, i); + + if (ti->type->iterate_devices && + ti->type->iterate_devices(ti, func, data)) + return true; + } + + return false; +} + static int count_device(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { @@ -1425,13 +1468,13 @@ bool dm_table_has_no_data_devices(struct dm_table *table) return true; } -static int device_is_zoned_model(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) +static int device_not_zoned_model(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) { struct request_queue *q = bdev_get_queue(dev->bdev); enum blk_zoned_model *zoned_model = data; - return q && blk_queue_zoned_model(q) == *zoned_model; + return !q || blk_queue_zoned_model(q) != *zoned_model; } static bool dm_table_supports_zoned_model(struct dm_table *t, @@ -1448,37 +1491,20 @@ static bool dm_table_supports_zoned_model(struct dm_table *t, return false; if (!ti->type->iterate_devices || - !ti->type->iterate_devices(ti, device_is_zoned_model, &zoned_model)) + ti->type->iterate_devices(ti, device_not_zoned_model, &zoned_model)) return false; } return true; } -static int device_matches_zone_sectors(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) +static int device_not_matches_zone_sectors(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) { struct request_queue *q = bdev_get_queue(dev->bdev); unsigned int *zone_sectors = data; - return q && blk_queue_zone_sectors(q) == *zone_sectors; -} - -static bool dm_table_matches_zone_sectors(struct dm_table *t, - unsigned int zone_sectors) -{ - struct dm_target *ti; - unsigned i; - - for (i = 0; i < dm_table_get_num_targets(t); i++) { - ti = dm_table_get_target(t, i); - - if (!ti->type->iterate_devices || - !ti->type->iterate_devices(ti, device_matches_zone_sectors, &zone_sectors)) - return false; - } - - return true; + return !q || blk_queue_zone_sectors(q) != *zone_sectors; } static int validate_hardware_zoned_model(struct dm_table *table, @@ -1498,7 +1524,7 @@ static int validate_hardware_zoned_model(struct dm_table *table, if (!zone_sectors || !is_power_of_2(zone_sectors)) return -EINVAL; - if (!dm_table_matches_zone_sectors(table, zone_sectors)) { + if (dm_table_any_dev_attr(table, device_not_matches_zone_sectors, &zone_sectors)) { DMERR("%s: zone sectors is not consistent across all devices", dm_device_name(table->md)); return -EINVAL; @@ -1688,29 +1714,12 @@ static int device_dax_write_cache_enabled(struct dm_target *ti, return false; } -static int dm_table_supports_dax_write_cache(struct dm_table *t) -{ - struct dm_target *ti; - unsigned i; - - for (i = 0; i < dm_table_get_num_targets(t); i++) { - ti = dm_table_get_target(t, i); - - if (ti->type->iterate_devices && - ti->type->iterate_devices(ti, - device_dax_write_cache_enabled, NULL)) - return true; - } - - return false; -} - -static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) +static int device_is_rotational(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) { struct request_queue *q = bdev_get_queue(dev->bdev); - return q && blk_queue_nonrot(q); + return q && !blk_queue_nonrot(q); } static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev, @@ -1721,43 +1730,26 @@ static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev, return q && !blk_queue_add_random(q); } -static int queue_supports_sg_merge(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) +static int queue_no_sg_merge(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) { struct request_queue *q = bdev_get_queue(dev->bdev); - return q && !test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags); -} - -static bool dm_table_all_devices_attribute(struct dm_table *t, - iterate_devices_callout_fn func) -{ - struct dm_target *ti; - unsigned i; - - for (i = 0; i < dm_table_get_num_targets(t); i++) { - ti = dm_table_get_target(t, i); - - if (!ti->type->iterate_devices || - !ti->type->iterate_devices(ti, func, NULL)) - return false; - } - - return true; + return q && test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags); } -static int device_no_partial_completion(struct dm_target *ti, struct dm_dev *dev, +static int device_is_partial_completion(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { char b[BDEVNAME_SIZE]; /* For now, NVMe devices are the only devices of this class */ - return (strncmp(bdevname(dev->bdev, b), "nvme", 4) == 0); + return (strncmp(bdevname(dev->bdev, b), "nvme", 4) != 0); } static bool dm_table_does_not_support_partial_completion(struct dm_table *t) { - return dm_table_all_devices_attribute(t, device_no_partial_completion); + return !dm_table_any_dev_attr(t, device_is_partial_completion, NULL); } static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *dev, @@ -1884,27 +1876,6 @@ static int device_requires_stable_pages(struct dm_target *ti, return q && bdi_cap_stable_pages_required(q->backing_dev_info); } -/* - * If any underlying device requires stable pages, a table must require - * them as well. Only targets that support iterate_devices are considered: - * don't want error, zero, etc to require stable pages. - */ -static bool dm_table_requires_stable_pages(struct dm_table *t) -{ - struct dm_target *ti; - unsigned i; - - for (i = 0; i < dm_table_get_num_targets(t); i++) { - ti = dm_table_get_target(t, i); - - if (ti->type->iterate_devices && - ti->type->iterate_devices(ti, device_requires_stable_pages, NULL)) - return true; - } - - return false; -} - void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *limits) { @@ -1941,32 +1912,35 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, else blk_queue_flag_clear(QUEUE_FLAG_DAX, q); - if (dm_table_supports_dax_write_cache(t)) + if (dm_table_any_dev_attr(t, device_dax_write_cache_enabled, NULL)) dax_write_cache(t->md->dax_dev, true); /* Ensure that all underlying devices are non-rotational. */ - if (dm_table_all_devices_attribute(t, device_is_nonrot)) - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - else + if (dm_table_any_dev_attr(t, device_is_rotational, NULL)) blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); + else + blk_queue_flag_set(QUEUE_FLAG_NONROT, q); if (!dm_table_supports_write_same(t)) q->limits.max_write_same_sectors = 0; if (!dm_table_supports_write_zeroes(t)) q->limits.max_write_zeroes_sectors = 0; - if (dm_table_all_devices_attribute(t, queue_supports_sg_merge)) - blk_queue_flag_clear(QUEUE_FLAG_NO_SG_MERGE, q); - else + if (dm_table_any_dev_attr(t, queue_no_sg_merge, NULL)) blk_queue_flag_set(QUEUE_FLAG_NO_SG_MERGE, q); + else + blk_queue_flag_clear(QUEUE_FLAG_NO_SG_MERGE, q); dm_table_verify_integrity(t); /* * Some devices don't use blk_integrity but still want stable pages * because they do their own checksumming. + * If any underlying device requires stable pages, a table must require + * them as well. Only targets that support iterate_devices are considered: + * don't want error, zero, etc to require stable pages. */ - if (dm_table_requires_stable_pages(t)) + if (dm_table_any_dev_attr(t, device_requires_stable_pages, NULL)) q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES; else q->backing_dev_info->capabilities &= ~BDI_CAP_STABLE_WRITES; @@ -1977,7 +1951,8 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, * Clear QUEUE_FLAG_ADD_RANDOM if any underlying device does not * have it set. */ - if (blk_queue_add_random(q) && dm_table_all_devices_attribute(t, device_is_not_random)) + if (blk_queue_add_random(q) && + dm_table_any_dev_attr(t, device_is_not_random, NULL)) blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q); } diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 85077f4d257a..f374f593fe55 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -660,6 +660,15 @@ static int __open_metadata(struct dm_pool_metadata *pmd) goto bad_cleanup_data_sm; } + /* + * For pool metadata opening process, root setting is redundant + * because it will be set again in __begin_transaction(). But dm + * pool aborting process really needs to get last transaction's + * root to avoid accessing broken btree. + */ + pmd->root = le64_to_cpu(disk_super->data_mapping_root); + pmd->details_root = le64_to_cpu(disk_super->device_details_root); + __setup_btree_details(pmd); dm_bm_unlock(sblock); @@ -901,7 +910,7 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd) return -EBUSY; } - if (!dm_bm_is_read_only(pmd->bm) && !pmd->fail_io) { + if (!pmd->fail_io && !dm_bm_is_read_only(pmd->bm)) { r = __commit_transaction(pmd); if (r < 0) DMWARN("%s: __commit_transaction() failed, error = %d", diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 435a2ee4a392..a1bbf00e60e5 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -2222,6 +2222,7 @@ static void process_thin_deferred_bios(struct thin_c *tc) throttle_work_update(&pool->throttle); dm_pool_issue_prefetches(pool->pmd); } + cond_resched(); } blk_finish_plug(&plug); } @@ -2305,6 +2306,7 @@ static void process_thin_deferred_cells(struct thin_c *tc) else pool->process_cell(tc, cell); } + cond_resched(); } while (!list_empty(&cells)); } @@ -2921,6 +2923,8 @@ static void __pool_destroy(struct pool *pool) dm_bio_prison_destroy(pool->prison); dm_kcopyd_client_destroy(pool->copier); + cancel_delayed_work_sync(&pool->waker); + cancel_delayed_work_sync(&pool->no_space_timeout); if (pool->wq) destroy_workqueue(pool->wq); @@ -3361,6 +3365,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) pt->low_water_blocks = low_water_blocks; pt->adjusted_pf = pt->requested_pf = pf; ti->num_flush_bios = 1; + ti->limit_swap_bios = true; /* * Only need to enable discards if the pool should pass @@ -3547,20 +3552,28 @@ static int pool_preresume(struct dm_target *ti) */ r = bind_control_target(pool, ti); if (r) - return r; + goto out; r = maybe_resize_data_dev(ti, &need_commit1); if (r) - return r; + goto out; r = maybe_resize_metadata_dev(ti, &need_commit2); if (r) - return r; + goto out; if (need_commit1 || need_commit2) (void) commit(pool); +out: + /* + * When a thin-pool is PM_FAIL, it cannot be rebuilt if + * bio is in deferred list. Therefore need to return 0 + * to allow pool_resume() to flush IO. + */ + if (r && get_pool_mode(pool) == PM_FAIL) + r = 0; - return 0; + return r; } static void pool_suspend_active_thins(struct pool *pool) @@ -4233,6 +4246,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; ti->num_flush_bios = 1; + ti->limit_swap_bios = true; ti->flush_supported = true; ti->per_io_data_size = sizeof(struct dm_thin_endio_hook); diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index bb8327999705..67b533c19e26 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -28,7 +28,8 @@ bool verity_fec_is_enabled(struct dm_verity *v) */ static inline struct dm_verity_fec_io *fec_io(struct dm_verity_io *io) { - return (struct dm_verity_fec_io *) verity_io_digest_end(io->v, io); + return (struct dm_verity_fec_io *) + ((char *)io + io->v->ti->per_io_data_size - sizeof(struct dm_verity_fec_io)); } /* @@ -65,19 +66,18 @@ static int fec_decode_rs8(struct dm_verity *v, struct dm_verity_fec_io *fio, static u8 *fec_read_parity(struct dm_verity *v, u64 rsb, int index, unsigned *offset, struct dm_buffer **buf) { - u64 position, block; + u64 position, block, rem; u8 *res; position = (index + rsb) * v->fec->roots; - block = position >> v->data_dev_block_bits; - *offset = (unsigned)(position - (block << v->data_dev_block_bits)); + block = div64_u64_rem(position, v->fec->io_size, &rem); + *offset = (unsigned)rem; - res = dm_bufio_read(v->fec->bufio, v->fec->start + block, buf); + res = dm_bufio_read(v->fec->bufio, block, buf); if (unlikely(IS_ERR(res))) { DMERR("%s: FEC %llu: parity read failed (block %llu): %ld", v->data_dev->name, (unsigned long long)rsb, - (unsigned long long)(v->fec->start + block), - PTR_ERR(res)); + (unsigned long long)block, PTR_ERR(res)); *buf = NULL; } @@ -159,7 +159,7 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio, /* read the next block when we run out of parity bytes */ offset += v->fec->roots; - if (offset >= 1 << v->data_dev_block_bits) { + if (offset >= v->fec->io_size) { dm_bufio_release(buf); par = fec_read_parity(v, rsb, block_offset, &offset, &buf); @@ -675,7 +675,7 @@ int verity_fec_ctr(struct dm_verity *v) { struct dm_verity_fec *f = v->fec; struct dm_target *ti = v->ti; - u64 hash_blocks; + u64 hash_blocks, fec_blocks; int ret; if (!verity_fec_is_enabled(v)) { @@ -744,16 +744,23 @@ int verity_fec_ctr(struct dm_verity *v) return -E2BIG; } + if ((f->roots << SECTOR_SHIFT) & ((1 << v->data_dev_block_bits) - 1)) + f->io_size = 1 << v->data_dev_block_bits; + else + f->io_size = v->fec->roots << SECTOR_SHIFT; + f->bufio = dm_bufio_client_create(f->dev->bdev, - 1 << v->data_dev_block_bits, + f->io_size, 1, 0, NULL, NULL); if (IS_ERR(f->bufio)) { ti->error = "Cannot initialize FEC bufio client"; return PTR_ERR(f->bufio); } - if (dm_bufio_get_device_size(f->bufio) < - ((f->start + f->rounds * f->roots) >> v->data_dev_block_bits)) { + dm_bufio_set_sector_offset(f->bufio, f->start << (v->data_dev_block_bits - SECTOR_SHIFT)); + + fec_blocks = div64_u64(f->rounds * f->roots, v->fec->roots << SECTOR_SHIFT); + if (dm_bufio_get_device_size(f->bufio) < fec_blocks) { ti->error = "FEC device is too small"; return -E2BIG; } diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h index 6ad803b2b36c..b9f0bb0f63c4 100644 --- a/drivers/md/dm-verity-fec.h +++ b/drivers/md/dm-verity-fec.h @@ -40,6 +40,7 @@ struct dm_verity_fec { struct dm_dev *dev; /* parity data device */ struct dm_bufio_client *data_bufio; /* for data dev access */ struct dm_bufio_client *bufio; /* for parity data access */ + size_t io_size; /* IO size for roots */ sector_t start; /* parity data start in blocks */ sector_t blocks; /* number of blocks covered */ sector_t rounds; /* number of interleaving rounds */ diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index e3599b43f9eb..76d60c55d380 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -34,7 +34,7 @@ #define DM_VERITY_OPT_IGN_ZEROES "ignore_zero_blocks" #define DM_VERITY_OPT_AT_MOST_ONCE "check_at_most_once" -#define DM_VERITY_OPTS_MAX (2 + DM_VERITY_OPTS_FEC) +#define DM_VERITY_OPTS_MAX (3 + DM_VERITY_OPTS_FEC) static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE; @@ -471,13 +471,14 @@ static int verity_verify_io(struct dm_verity_io *io) struct bvec_iter start; unsigned b; struct crypto_wait wait; + struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); for (b = 0; b < io->n_blocks; b++) { int r; sector_t cur_block = io->block + b; struct ahash_request *req = verity_io_hash_req(v, io); - if (v->validated_blocks && + if (v->validated_blocks && bio->bi_status == BLK_STS_OK && likely(test_bit(cur_block, v->validated_blocks))) { verity_bv_skip_block(v, io, &io->iter); continue; @@ -525,15 +526,32 @@ static int verity_verify_io(struct dm_verity_io *io) else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA, cur_block, NULL, &start) == 0) continue; - else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA, - cur_block)) - return -EIO; + else { + if (bio->bi_status) { + /* + * Error correction failed; Just return error + */ + return -EIO; + } + if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA, + cur_block)) + return -EIO; + } } return 0; } /* + * Skip verity work in response to I/O error when system is shutting down. + */ +static inline bool verity_is_system_shutting_down(void) +{ + return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF + || system_state == SYSTEM_RESTART; +} + +/* * End one "io" structure with a given error. */ static void verity_finish_io(struct dm_verity_io *io, blk_status_t status) @@ -560,7 +578,10 @@ static void verity_end_io(struct bio *bio) { struct dm_verity_io *io = bio->bi_private; - if (bio->bi_status && !verity_fec_is_enabled(io->v)) { + if (bio->bi_status && + (!verity_fec_is_enabled(io->v) || + verity_is_system_shutting_down() || + (bio->bi_opf & REQ_RAHEAD))) { verity_finish_io(io, bio->bi_status); return; } @@ -1166,6 +1187,7 @@ bad: static struct target_type verity_target = { .name = "verity", + .features = DM_TARGET_IMMUTABLE, .version = {1, 4, 0}, .module = THIS_MODULE, .ctr = verity_ctr, diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h index 3441c10b840c..6e65ec0e627a 100644 --- a/drivers/md/dm-verity.h +++ b/drivers/md/dm-verity.h @@ -109,12 +109,6 @@ static inline u8 *verity_io_want_digest(struct dm_verity *v, return (u8 *)(io + 1) + v->ahash_reqsize + v->digest_size; } -static inline u8 *verity_io_digest_end(struct dm_verity *v, - struct dm_verity_io *io) -{ - return verity_io_want_digest(v, io) + v->digest_size; -} - extern int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io, struct bvec_iter *iter, int (*process)(struct dm_verity *v, diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 776aaf5951e4..af5af0b78022 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -20,7 +20,7 @@ #define HIGH_WATERMARK 50 #define LOW_WATERMARK 45 -#define MAX_WRITEBACK_JOBS 0 +#define MAX_WRITEBACK_JOBS min(0x10000000 / PAGE_SIZE, totalram_pages / 16) #define ENDIO_LATENCY 16 #define WRITEBACK_LATENCY 64 #define AUTOCOMMIT_BLOCKS_SSD 65536 @@ -142,6 +142,7 @@ struct dm_writecache { size_t metadata_sectors; size_t n_blocks; uint64_t seq_count; + sector_t data_device_sectors; void *block_start; struct wc_entry *entries; unsigned block_size; @@ -153,6 +154,7 @@ struct dm_writecache { bool overwrote_committed:1; bool memory_vmapped:1; + bool start_sector_set:1; bool high_wm_percent_set:1; bool low_wm_percent_set:1; bool max_writeback_jobs_set:1; @@ -161,6 +163,10 @@ struct dm_writecache { bool writeback_fua_set:1; bool flush_on_suspend:1; + unsigned high_wm_percent_value; + unsigned low_wm_percent_value; + unsigned autocommit_time_value; + unsigned writeback_all; struct workqueue_struct *writeback_wq; struct work_struct writeback_work; @@ -318,7 +324,7 @@ err1: #else static int persistent_memory_claim(struct dm_writecache *wc) { - BUG(); + return -EOPNOTSUPP; } #endif @@ -924,6 +930,8 @@ static void writecache_resume(struct dm_target *ti) wc_lock(wc); + wc->data_device_sectors = i_size_read(wc->dev->bdev->bd_inode) >> SECTOR_SHIFT; + if (WC_MODE_PMEM(wc)) { persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size); } else { @@ -1494,6 +1502,10 @@ static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e, gfp_t void *address = memory_data(wc, e); persistent_memory_flush_cache(address, block_size); + + if (unlikely(bio_end_sector(&wb->bio) >= wc->data_device_sectors)) + return true; + return bio_add_page(&wb->bio, persistent_memory_page(address), block_size, persistent_memory_page_offset(address)) != 0; } @@ -1566,6 +1578,9 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba if (writecache_has_error(wc)) { bio->bi_status = BLK_STS_IOERR; bio_endio(&wb->bio); + } else if (unlikely(!bio_sectors(&wb->bio))) { + bio->bi_status = BLK_STS_OK; + bio_endio(&wb->bio); } else { submit_bio(&wb->bio); } @@ -1609,6 +1624,14 @@ static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writebac e = f; } + if (unlikely(to.sector + to.count > wc->data_device_sectors)) { + if (to.sector >= wc->data_device_sectors) { + writecache_copy_endio(0, 0, c); + continue; + } + from.count = to.count = wc->data_device_sectors - to.sector; + } + dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c); __writeback_throttle(wc, wbl); @@ -1883,7 +1906,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) struct wc_memory_superblock s; static struct dm_arg _args[] = { - {0, 10, "Invalid number of feature args"}, + {0, 16, "Invalid number of feature args"}, }; as.argc = argc; @@ -2045,6 +2068,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1) goto invalid_optional; wc->start_sector = start_sector; + wc->start_sector_set = true; if (wc->start_sector != start_sector || wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT) goto invalid_optional; @@ -2054,6 +2078,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) goto invalid_optional; if (high_wm_percent < 0 || high_wm_percent > 100) goto invalid_optional; + wc->high_wm_percent_value = high_wm_percent; wc->high_wm_percent_set = true; } else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) { string = dm_shift_arg(&as), opt_params--; @@ -2061,6 +2086,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) goto invalid_optional; if (low_wm_percent < 0 || low_wm_percent > 100) goto invalid_optional; + wc->low_wm_percent_value = low_wm_percent; wc->low_wm_percent_set = true; } else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) { string = dm_shift_arg(&as), opt_params--; @@ -2080,6 +2106,7 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) if (autocommit_msecs > 3600000) goto invalid_optional; wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs); + wc->autocommit_time_value = autocommit_msecs; wc->autocommit_time_set = true; } else if (!strcasecmp(string, "fua")) { if (WC_MODE_PMEM(wc)) { @@ -2275,7 +2302,6 @@ static void writecache_status(struct dm_target *ti, status_type_t type, struct dm_writecache *wc = ti->private; unsigned extra_args; unsigned sz = 0; - uint64_t x; switch (type) { case STATUSTYPE_INFO: @@ -2287,7 +2313,7 @@ static void writecache_status(struct dm_target *ti, status_type_t type, DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's', wc->dev->name, wc->ssd_dev->name, wc->block_size); extra_args = 0; - if (wc->start_sector) + if (wc->start_sector_set) extra_args += 2; if (wc->high_wm_percent_set) extra_args += 2; @@ -2303,26 +2329,18 @@ static void writecache_status(struct dm_target *ti, status_type_t type, extra_args++; DMEMIT("%u", extra_args); - if (wc->start_sector) + if (wc->start_sector_set) DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector); - if (wc->high_wm_percent_set) { - x = (uint64_t)wc->freelist_high_watermark * 100; - x += wc->n_blocks / 2; - do_div(x, (size_t)wc->n_blocks); - DMEMIT(" high_watermark %u", 100 - (unsigned)x); - } - if (wc->low_wm_percent_set) { - x = (uint64_t)wc->freelist_low_watermark * 100; - x += wc->n_blocks / 2; - do_div(x, (size_t)wc->n_blocks); - DMEMIT(" low_watermark %u", 100 - (unsigned)x); - } + if (wc->high_wm_percent_set) + DMEMIT(" high_watermark %u", wc->high_wm_percent_value); + if (wc->low_wm_percent_set) + DMEMIT(" low_watermark %u", wc->low_wm_percent_value); if (wc->max_writeback_jobs_set) DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs); if (wc->autocommit_blocks_set) DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks); if (wc->autocommit_time_set) - DMEMIT(" autocommit_time %u", jiffies_to_msecs(wc->autocommit_jiffies)); + DMEMIT(" autocommit_time %u", wc->autocommit_time_value); if (wc->writeback_fua_set) DMEMIT(" %sfua", wc->writeback_fua ? "" : "no"); break; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 874bd542a744..9a9b2adcf39e 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -146,6 +146,16 @@ EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr); #define DM_NUMA_NODE NUMA_NO_NODE static int dm_numa_node = DM_NUMA_NODE; +#define DEFAULT_SWAP_BIOS (8 * 1048576 / PAGE_SIZE) +static int swap_bios = DEFAULT_SWAP_BIOS; +static int get_swap_bios(void) +{ + int latch = READ_ONCE(swap_bios); + if (unlikely(latch <= 0)) + latch = DEFAULT_SWAP_BIOS; + return latch; +} + /* * For mempools pre-allocation at the table loading time. */ @@ -269,7 +279,6 @@ out_free_rq_tio_cache: static void local_exit(void) { - flush_scheduled_work(); destroy_workqueue(deferred_remove_workqueue); kmem_cache_destroy(_rq_cache); @@ -462,7 +471,6 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx, struct block_device **bdev) - __acquires(md->io_barrier) { struct dm_target *tgt; struct dm_table *map; @@ -496,7 +504,6 @@ retry: } static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx) - __releases(md->io_barrier) { dm_put_live_table(md, srcu_idx); } @@ -517,7 +524,7 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, * subset of the parent bdev; require extra privileges. */ if (!capable(CAP_SYS_RAWIO)) { - DMWARN_LIMIT( + DMDEBUG_LIMIT( "%s: sending ioctl %x to DM device without required privilege.", current->comm, cmd); r = -ENOIOCTLCMD; @@ -623,21 +630,20 @@ static void start_io_acct(struct dm_io *io) false, 0, &io->stats_aux); } -static void end_io_acct(struct dm_io *io) +static void end_io_acct(struct mapped_device *md, struct bio *bio, + unsigned long start_time, struct dm_stats_aux *stats_aux) { - struct mapped_device *md = io->md; - struct bio *bio = io->orig_bio; - unsigned long duration = jiffies - io->start_time; + unsigned long duration = jiffies - start_time; int pending; int rw = bio_data_dir(bio); generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0, - io->start_time); + start_time); if (unlikely(dm_stats_used(&md->stats))) dm_stats_account_io(&md->stats, bio_data_dir(bio), bio->bi_iter.bi_sector, bio_sectors(bio), - true, duration, &io->stats_aux); + true, duration, stats_aux); /* * After this is decremented the bio must not be touched if it is @@ -864,6 +870,8 @@ static void dec_pending(struct dm_io *io, blk_status_t error) blk_status_t io_error; struct bio *bio; struct mapped_device *md = io->md; + unsigned long start_time = 0; + struct dm_stats_aux stats_aux; /* Push-back supersedes any I/O errors */ if (unlikely(error)) { @@ -890,8 +898,10 @@ static void dec_pending(struct dm_io *io, blk_status_t error) io_error = io->status; bio = io->orig_bio; - end_io_acct(io); + start_time = io->start_time; + stats_aux = io->stats_aux; free_io(md, io); + end_io_acct(md, bio, start_time, &stats_aux); if (io_error == BLK_STS_DM_REQUEUE) return; @@ -937,6 +947,11 @@ void disable_write_zeroes(struct mapped_device *md) limits->max_write_zeroes_sectors = 0; } +static bool swap_bios_limit(struct dm_target *ti, struct bio *bio) +{ + return unlikely((bio->bi_opf & REQ_SWAP) != 0) && unlikely(ti->limit_swap_bios); +} + static void clone_endio(struct bio *bio) { blk_status_t error = bio->bi_status; @@ -974,6 +989,11 @@ static void clone_endio(struct bio *bio) } } + if (unlikely(swap_bios_limit(tio->ti, bio))) { + struct mapped_device *md = io->md; + up(&md->swap_bios_semaphore); + } + free_tio(tio); dec_pending(io, error); } @@ -1252,6 +1272,22 @@ void dm_remap_zone_report(struct dm_target *ti, struct bio *bio, sector_t start) } EXPORT_SYMBOL_GPL(dm_remap_zone_report); +static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch) +{ + mutex_lock(&md->swap_bios_lock); + while (latch < md->swap_bios) { + cond_resched(); + down(&md->swap_bios_semaphore); + md->swap_bios--; + } + while (latch > md->swap_bios) { + cond_resched(); + up(&md->swap_bios_semaphore); + md->swap_bios++; + } + mutex_unlock(&md->swap_bios_lock); +} + static blk_qc_t __map_bio(struct dm_target_io *tio) { int r; @@ -1272,6 +1308,14 @@ static blk_qc_t __map_bio(struct dm_target_io *tio) atomic_inc(&io->io_count); sector = clone->bi_iter.bi_sector; + if (unlikely(swap_bios_limit(ti, clone))) { + struct mapped_device *md = io->md; + int latch = get_swap_bios(); + if (unlikely(latch != md->swap_bios)) + __set_swap_bios_limit(md, latch); + down(&md->swap_bios_semaphore); + } + r = ti->type->map(ti, clone); switch (r) { case DM_MAPIO_SUBMITTED: @@ -1286,10 +1330,18 @@ static blk_qc_t __map_bio(struct dm_target_io *tio) ret = generic_make_request(clone); break; case DM_MAPIO_KILL: + if (unlikely(swap_bios_limit(ti, clone))) { + struct mapped_device *md = io->md; + up(&md->swap_bios_semaphore); + } free_tio(tio); dec_pending(io, BLK_STS_IOERR); break; case DM_MAPIO_REQUEUE: + if (unlikely(swap_bios_limit(ti, clone))) { + struct mapped_device *md = io->md; + up(&md->swap_bios_semaphore); + } free_tio(tio); dec_pending(io, BLK_STS_DM_REQUEUE); break; @@ -1861,6 +1913,7 @@ static void cleanup_mapped_device(struct mapped_device *md) mutex_destroy(&md->suspend_lock); mutex_destroy(&md->type_lock); mutex_destroy(&md->table_devices_lock); + mutex_destroy(&md->swap_bios_lock); dm_mq_cleanup_mapped_device(md); } @@ -1935,6 +1988,10 @@ static struct mapped_device *alloc_dev(int minor) init_completion(&md->kobj_holder.completion); md->kworker_task = NULL; + md->swap_bios = get_swap_bios(); + sema_init(&md->swap_bios_semaphore, md->swap_bios); + mutex_init(&md->swap_bios_lock); + md->disk->major = _major; md->disk->first_minor = minor; md->disk->fops = &dm_blk_dops; @@ -1964,7 +2021,9 @@ static struct mapped_device *alloc_dev(int minor) bio_set_dev(&md->flush_bio, md->bdev); md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; - dm_stats_init(&md->stats); + r = dm_stats_init(&md->stats); + if (r < 0) + goto bad; /* Populate the mapping, nobody knows we exist yet */ spin_lock(&_minor_lock); @@ -2417,6 +2476,8 @@ static int dm_wait_for_completion(struct mapped_device *md, long task_state) } finish_wait(&md->wait, &wait); + smp_rmb(); /* paired with atomic_dec_return in end_io_acct */ + return r; } @@ -3062,6 +3123,11 @@ static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn, goto out; ti = dm_table_get_target(table, 0); + if (dm_suspended_md(md)) { + ret = -EAGAIN; + goto out; + } + ret = -EINVAL; if (!ti->type->iterate_devices) goto out; @@ -3230,6 +3296,9 @@ MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools"); module_param(dm_numa_node, int, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations"); +module_param(swap_bios, int, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(swap_bios, "Maximum allowed inflight swap IOs"); + MODULE_DESCRIPTION(DM_NAME " driver"); MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); MODULE_LICENSE("GPL"); diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index fd8607124bdb..7ca81e917aef 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -53,14 +53,7 @@ __acquires(bitmap->lock) { unsigned char *mappage; - if (page >= bitmap->pages) { - /* This can happen if bitmap_start_sync goes beyond - * End-of-device while looking for a whole page. - * It is harmless. - */ - return -EINVAL; - } - + WARN_ON_ONCE(page >= bitmap->pages); if (bitmap->bp[page].hijacked) /* it's hijacked, don't try to alloc */ return 0; @@ -488,7 +481,7 @@ void md_bitmap_print_sb(struct bitmap *bitmap) sb = kmap_atomic(bitmap->storage.sb_page); pr_debug("%s: bitmap file superblock:\n", bmname(bitmap)); pr_debug(" magic: %08x\n", le32_to_cpu(sb->magic)); - pr_debug(" version: %d\n", le32_to_cpu(sb->version)); + pr_debug(" version: %u\n", le32_to_cpu(sb->version)); pr_debug(" uuid: %08x.%08x.%08x.%08x\n", le32_to_cpu(*(__u32 *)(sb->uuid+0)), le32_to_cpu(*(__u32 *)(sb->uuid+4)), @@ -499,11 +492,11 @@ void md_bitmap_print_sb(struct bitmap *bitmap) pr_debug("events cleared: %llu\n", (unsigned long long) le64_to_cpu(sb->events_cleared)); pr_debug(" state: %08x\n", le32_to_cpu(sb->state)); - pr_debug(" chunksize: %d B\n", le32_to_cpu(sb->chunksize)); - pr_debug(" daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep)); + pr_debug(" chunksize: %u B\n", le32_to_cpu(sb->chunksize)); + pr_debug(" daemon sleep: %us\n", le32_to_cpu(sb->daemon_sleep)); pr_debug(" sync size: %llu KB\n", (unsigned long long)le64_to_cpu(sb->sync_size)/2); - pr_debug("max write behind: %d\n", le32_to_cpu(sb->write_behind)); + pr_debug("max write behind: %u\n", le32_to_cpu(sb->write_behind)); kunmap_atomic(sb); } @@ -641,14 +634,6 @@ re_read: daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; write_behind = le32_to_cpu(sb->write_behind); sectors_reserved = le32_to_cpu(sb->sectors_reserved); - /* Setup nodes/clustername only if bitmap version is - * cluster-compatible - */ - if (sb->version == cpu_to_le32(BITMAP_MAJOR_CLUSTERED)) { - nodes = le32_to_cpu(sb->nodes); - strlcpy(bitmap->mddev->bitmap_info.cluster_name, - sb->cluster_name, 64); - } /* verify that the bitmap-specific fields are valid */ if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) @@ -670,6 +655,16 @@ re_read: goto out; } + /* + * Setup nodes/clustername only if bitmap version is + * cluster-compatible + */ + if (sb->version == cpu_to_le32(BITMAP_MAJOR_CLUSTERED)) { + nodes = le32_to_cpu(sb->nodes); + strlcpy(bitmap->mddev->bitmap_info.cluster_name, + sb->cluster_name, 64); + } + /* keep the array size field of the bitmap superblock up to date */ sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); @@ -702,9 +697,9 @@ re_read: out: kunmap_atomic(sb); - /* Assigning chunksize is required for "re_read" */ - bitmap->mddev->bitmap_info.chunksize = chunksize; if (err == 0 && nodes && (bitmap->cluster_slot < 0)) { + /* Assigning chunksize is required for "re_read" */ + bitmap->mddev->bitmap_info.chunksize = chunksize; err = md_setup_cluster(bitmap->mddev, nodes); if (err) { pr_warn("%s: Could not setup cluster service (%d)\n", @@ -715,18 +710,18 @@ out: goto re_read; } - out_no_sb: - if (test_bit(BITMAP_STALE, &bitmap->flags)) - bitmap->events_cleared = bitmap->mddev->events; - bitmap->mddev->bitmap_info.chunksize = chunksize; - bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; - bitmap->mddev->bitmap_info.max_write_behind = write_behind; - bitmap->mddev->bitmap_info.nodes = nodes; - if (bitmap->mddev->bitmap_info.space == 0 || - bitmap->mddev->bitmap_info.space > sectors_reserved) - bitmap->mddev->bitmap_info.space = sectors_reserved; - if (err) { + if (err == 0) { + if (test_bit(BITMAP_STALE, &bitmap->flags)) + bitmap->events_cleared = bitmap->mddev->events; + bitmap->mddev->bitmap_info.chunksize = chunksize; + bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; + bitmap->mddev->bitmap_info.max_write_behind = write_behind; + bitmap->mddev->bitmap_info.nodes = nodes; + if (bitmap->mddev->bitmap_info.space == 0 || + bitmap->mddev->bitmap_info.space > sectors_reserved) + bitmap->mddev->bitmap_info.space = sectors_reserved; + } else { md_bitmap_print_sb(bitmap); if (bitmap->cluster_slot < 0) md_cluster_stop(bitmap->mddev); @@ -1366,12 +1361,20 @@ __acquires(bitmap->lock) sector_t csize; int err; + if (page >= bitmap->pages) { + /* + * This can happen if bitmap_start_sync goes beyond + * End-of-device while looking for a whole page or + * user set a huge number to sysfs bitmap_set_bits. + */ + return NULL; + } err = md_bitmap_checkpage(bitmap, page, create, 0); if (bitmap->bp[page].hijacked || bitmap->bp[page].map == NULL) csize = ((sector_t)1) << (bitmap->chunkshift + - PAGE_COUNTER_SHIFT - 1); + PAGE_COUNTER_SHIFT); else csize = ((sector_t)1) << bitmap->chunkshift; *blocks = csize - (offset & (csize - 1)); @@ -1725,6 +1728,8 @@ void md_bitmap_flush(struct mddev *mddev) md_bitmap_daemon_work(mddev); bitmap->daemon_lastrun -= sleep; md_bitmap_daemon_work(mddev); + if (mddev->bitmap_info.external) + md_super_wait(mddev); md_bitmap_update_sb(bitmap); } @@ -2097,7 +2102,8 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, bytes = DIV_ROUND_UP(chunks, 8); if (!bitmap->mddev->bitmap_info.external) bytes += sizeof(bitmap_super_t); - } while (bytes > (space << 9)); + } while (bytes > (space << 9) && (chunkshift + BITMAP_BLOCK_SHIFT) < + (BITS_PER_BYTE * sizeof(((bitmap_super_t *)0)->chunksize) - 1)); } else chunkshift = ffz(~chunksize) - BITMAP_BLOCK_SHIFT; @@ -2142,7 +2148,7 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, bitmap->counts.missing_pages = pages; bitmap->counts.chunkshift = chunkshift; bitmap->counts.chunks = chunks; - bitmap->mddev->bitmap_info.chunksize = 1 << (chunkshift + + bitmap->mddev->bitmap_info.chunksize = 1UL << (chunkshift + BITMAP_BLOCK_SHIFT); blocks = min(old_counts.chunks << old_counts.chunkshift, @@ -2168,8 +2174,8 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, bitmap->counts.missing_pages = old_counts.pages; bitmap->counts.chunkshift = old_counts.chunkshift; bitmap->counts.chunks = old_counts.chunks; - bitmap->mddev->bitmap_info.chunksize = 1 << (old_counts.chunkshift + - BITMAP_BLOCK_SHIFT); + bitmap->mddev->bitmap_info.chunksize = + 1UL << (old_counts.chunkshift + BITMAP_BLOCK_SHIFT); blocks = old_counts.chunks << old_counts.chunkshift; pr_warn("Could not pre-allocate in-memory bitmap for cluster raid\n"); break; @@ -2187,20 +2193,23 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, if (set) { bmc_new = md_bitmap_get_counter(&bitmap->counts, block, &new_blocks, 1); - if (*bmc_new == 0) { - /* need to set on-disk bits too. */ - sector_t end = block + new_blocks; - sector_t start = block >> chunkshift; - start <<= chunkshift; - while (start < end) { - md_bitmap_file_set_bit(bitmap, block); - start += 1 << chunkshift; + if (bmc_new) { + if (*bmc_new == 0) { + /* need to set on-disk bits too. */ + sector_t end = block + new_blocks; + sector_t start = block >> chunkshift; + + start <<= chunkshift; + while (start < end) { + md_bitmap_file_set_bit(bitmap, block); + start += 1 << chunkshift; + } + *bmc_new = 2; + md_bitmap_count_page(&bitmap->counts, block, 1); + md_bitmap_set_pending(&bitmap->counts, block); } - *bmc_new = 2; - md_bitmap_count_page(&bitmap->counts, block, 1); - md_bitmap_set_pending(&bitmap->counts, block); + *bmc_new |= NEEDED_MASK; } - *bmc_new |= NEEDED_MASK; if (new_blocks < old_blocks) old_blocks = new_blocks; } @@ -2492,6 +2501,9 @@ chunksize_store(struct mddev *mddev, const char *buf, size_t len) if (csize < 512 || !is_power_of_2(csize)) return -EINVAL; + if (BITS_PER_LONG > 32 && csize >= (1ULL << (BITS_PER_BYTE * + sizeof(((bitmap_super_t *)0)->chunksize)))) + return -EOVERFLOW; mddev->bitmap_info.chunksize = csize; return len; } diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 4522e87d9d68..107f36b9155f 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -669,9 +669,27 @@ out: * Takes the lock on the TOKEN lock resource so no other * node can communicate while the operation is underway. */ -static int lock_token(struct md_cluster_info *cinfo, bool mddev_locked) +static int lock_token(struct md_cluster_info *cinfo) { - int error, set_bit = 0; + int error; + + error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX); + if (error) { + pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n", + __func__, __LINE__, error); + } else { + /* Lock the receive sequence */ + mutex_lock(&cinfo->recv_mutex); + } + return error; +} + +/* lock_comm() + * Sets the MD_CLUSTER_SEND_LOCK bit to lock the send channel. + */ +static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked) +{ + int rv, set_bit = 0; struct mddev *mddev = cinfo->mddev; /* @@ -682,34 +700,19 @@ static int lock_token(struct md_cluster_info *cinfo, bool mddev_locked) */ if (mddev_locked && !test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state)) { - error = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, + rv = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); - WARN_ON_ONCE(error); + WARN_ON_ONCE(rv); md_wakeup_thread(mddev->thread); set_bit = 1; } - error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX); - if (set_bit) - clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); - if (error) - pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n", - __func__, __LINE__, error); - - /* Lock the receive sequence */ - mutex_lock(&cinfo->recv_mutex); - return error; -} - -/* lock_comm() - * Sets the MD_CLUSTER_SEND_LOCK bit to lock the send channel. - */ -static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked) -{ wait_event(cinfo->wait, !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state)); - - return lock_token(cinfo, mddev_locked); + rv = lock_token(cinfo); + if (set_bit) + clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); + return rv; } static void unlock_comm(struct md_cluster_info *cinfo) @@ -789,9 +792,11 @@ static int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg, { int ret; - lock_comm(cinfo, mddev_locked); - ret = __sendmsg(cinfo, cmsg); - unlock_comm(cinfo); + ret = lock_comm(cinfo, mddev_locked); + if (!ret) { + ret = __sendmsg(cinfo, cmsg); + unlock_comm(cinfo); + } return ret; } @@ -1063,7 +1068,7 @@ static int metadata_update_start(struct mddev *mddev) return 0; } - ret = lock_token(cinfo, 1); + ret = lock_token(cinfo); clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); return ret; } @@ -1181,7 +1186,10 @@ static void update_size(struct mddev *mddev, sector_t old_dev_sectors) int raid_slot = -1; md_update_sb(mddev, 1); - lock_comm(cinfo, 1); + if (lock_comm(cinfo, 1)) { + pr_err("%s: lock_comm failed\n", __func__); + return; + } memset(&cmsg, 0, sizeof(cmsg)); cmsg.type = cpu_to_le32(METADATA_UPDATED); @@ -1330,7 +1338,8 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev) cmsg.type = cpu_to_le32(NEWDISK); memcpy(cmsg.uuid, uuid, 16); cmsg.raid_slot = cpu_to_le32(rdev->desc_nr); - lock_comm(cinfo, 1); + if (lock_comm(cinfo, 1)) + return -EAGAIN; ret = __sendmsg(cinfo, &cmsg); if (ret) { unlock_comm(cinfo); diff --git a/drivers/md/md.c b/drivers/md/md.c index a4e7e6c025d9..6b074c2202d5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -417,13 +417,14 @@ static void md_end_flush(struct bio *bio) struct md_rdev *rdev = bio->bi_private; struct mddev *mddev = rdev->mddev; + bio_put(bio); + rdev_dec_pending(rdev, mddev); if (atomic_dec_and_test(&mddev->flush_pending)) { /* The pre-request flush has finished */ queue_work(md_wq, &mddev->flush_work); } - bio_put(bio); } static void md_submit_flush_data(struct work_struct *ws); @@ -474,8 +475,10 @@ static void md_submit_flush_data(struct work_struct *ws) * could wait for this and below md_handle_request could wait for those * bios because of suspend check */ + spin_lock_irq(&mddev->lock); mddev->last_flush = mddev->start_flush; mddev->flush_bio = NULL; + spin_unlock_irq(&mddev->lock); wake_up(&mddev->sb_wait); if (bio->bi_iter.bi_size == 0) { @@ -581,8 +584,35 @@ void mddev_init(struct mddev *mddev) } EXPORT_SYMBOL_GPL(mddev_init); +static struct mddev *mddev_find_locked(dev_t unit) +{ + struct mddev *mddev; + + list_for_each_entry(mddev, &all_mddevs, all_mddevs) + if (mddev->unit == unit) + return mddev; + + return NULL; +} + static struct mddev *mddev_find(dev_t unit) { + struct mddev *mddev; + + if (MAJOR(unit) != MD_MAJOR) + unit &= ~((1 << MdpMinorShift) - 1); + + spin_lock(&all_mddevs_lock); + mddev = mddev_find_locked(unit); + if (mddev) + mddev_get(mddev); + spin_unlock(&all_mddevs_lock); + + return mddev; +} + +static struct mddev *mddev_find_or_alloc(dev_t unit) +{ struct mddev *mddev, *new = NULL; if (unit && MAJOR(unit) != MD_MAJOR) @@ -592,13 +622,13 @@ static struct mddev *mddev_find(dev_t unit) spin_lock(&all_mddevs_lock); if (unit) { - list_for_each_entry(mddev, &all_mddevs, all_mddevs) - if (mddev->unit == unit) { - mddev_get(mddev); - spin_unlock(&all_mddevs_lock); - kfree(new); - return mddev; - } + mddev = mddev_find_locked(unit); + if (mddev) { + mddev_get(mddev); + spin_unlock(&all_mddevs_lock); + kfree(new); + return mddev; + } if (new) { list_add(&new->all_mddevs, &all_mddevs); @@ -624,12 +654,7 @@ static struct mddev *mddev_find(dev_t unit) return NULL; } - is_free = 1; - list_for_each_entry(mddev, &all_mddevs, all_mddevs) - if (mddev->unit == dev) { - is_free = 0; - break; - } + is_free = !mddev_find_locked(dev); } new->unit = dev; new->md_minor = MINOR(dev); @@ -797,10 +822,12 @@ static void super_written(struct bio *bio) } else clear_bit(LastDev, &rdev->flags); + bio_put(bio); + + rdev_dec_pending(rdev, mddev); + if (atomic_dec_and_test(&mddev->pending_writes)) wake_up(&mddev->sb_wait); - rdev_dec_pending(rdev, mddev); - bio_put(bio); } void md_super_write(struct mddev *mddev, struct md_rdev *rdev, @@ -2419,14 +2446,16 @@ static void sync_sbs(struct mddev *mddev, int nospares) static bool does_sb_need_changing(struct mddev *mddev) { - struct md_rdev *rdev; + struct md_rdev *rdev = NULL, *iter; struct mdp_superblock_1 *sb; int role; /* Find a good rdev */ - rdev_for_each(rdev, mddev) - if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags)) + rdev_for_each(iter, mddev) + if ((iter->raid_disk >= 0) && !test_bit(Faulty, &iter->flags)) { + rdev = iter; break; + } /* No good device found. */ if (!rdev) @@ -2962,6 +2991,9 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) err = kstrtouint(buf, 10, (unsigned int *)&slot); if (err < 0) return err; + if (slot < 0) + /* overflow */ + return -ENOSPC; } if (rdev->mddev->pers && slot == -1) { /* Setting 'slot' on an active array requires also @@ -3639,8 +3671,9 @@ int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) static ssize_t safe_delay_show(struct mddev *mddev, char *page) { - int msec = (mddev->safemode_delay*1000)/HZ; - return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); + unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ; + + return sprintf(page, "%u.%03u\n", msec/1000, msec%1000); } static ssize_t safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) @@ -3652,7 +3685,7 @@ safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) return -EINVAL; } - if (strict_strtoul_scaled(cbuf, &msec, 3) < 0) + if (strict_strtoul_scaled(cbuf, &msec, 3) < 0 || msec > UINT_MAX / HZ) return -EINVAL; if (msec == 0) mddev->safemode_delay = 0; @@ -4304,6 +4337,8 @@ max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len rv = kstrtouint(buf, 10, &n); if (rv < 0) return rv; + if (n > INT_MAX) + return -EINVAL; atomic_set(&mddev->max_corr_read_errors, n); return len; } @@ -4604,11 +4639,21 @@ action_store(struct mddev *mddev, const char *page, size_t len) return -EINVAL; err = mddev_lock(mddev); if (!err) { - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { err = -EBUSY; - else { + } else if (mddev->reshape_position == MaxSector || + mddev->pers->check_reshape == NULL || + mddev->pers->check_reshape(mddev)) { clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); err = mddev->pers->start_reshape(mddev); + } else { + /* + * If reshape is still in progress, and + * md_check_recovery() can continue to reshape, + * don't restart reshape because data can be + * corrupted for raid456. + */ + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } mddev_unlock(mddev); } @@ -5300,7 +5345,7 @@ static int md_alloc(dev_t dev, char *name) * writing to /sys/module/md_mod/parameters/new_array. */ static DEFINE_MUTEX(disks_mutex); - struct mddev *mddev = mddev_find(dev); + struct mddev *mddev = mddev_find_or_alloc(dev); struct gendisk *disk; int partitioned; int shift; @@ -5377,10 +5422,6 @@ static int md_alloc(dev_t dev, char *name) */ disk->flags |= GENHD_FL_EXT_DEVT; mddev->gendisk = disk; - /* As soon as we call add_disk(), another thread could get - * through to md_open, so make sure it doesn't get too far - */ - mutex_lock(&mddev->open_mutex); add_disk(disk); error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md"); @@ -5395,7 +5436,6 @@ static int md_alloc(dev_t dev, char *name) if (mddev->kobj.sd && sysfs_create_group(&mddev->kobj, &md_bitmap_group)) pr_debug("pointless warning\n"); - mutex_unlock(&mddev->open_mutex); abort: mutex_unlock(&disks_mutex); if (!error && mddev->kobj.sd) { @@ -5916,6 +5956,7 @@ void md_stop(struct mddev *mddev) /* stop the array and free an attached data structures. * This is called from dm-raid */ + __md_stop_writes(mddev); __md_stop(mddev); bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); @@ -6153,11 +6194,9 @@ static void autorun_devices(int part) md_probe(dev, NULL, NULL); mddev = mddev_find(dev); - if (!mddev || !mddev->gendisk) { - if (mddev) - mddev_put(mddev); + if (!mddev) break; - } + if (mddev_lock(mddev)) pr_warn("md: %s locked, cannot run\n", mdname(mddev)); else if (mddev->raid_disks || mddev->major_version @@ -6564,8 +6603,10 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) goto busy; kick_rdev: - if (mddev_is_clustered(mddev)) - md_cluster_ops->remove_disk(mddev, rdev); + if (mddev_is_clustered(mddev)) { + if (md_cluster_ops->remove_disk(mddev, rdev)) + goto busy; + } md_kick_rdev_from_array(rdev); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); @@ -6895,6 +6936,7 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks) return -EINVAL; if (mddev->sync_thread || test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || + test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) || mddev->reshape_position != MaxSector) return -EBUSY; @@ -7214,8 +7256,11 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, err = -EBUSY; goto out; } - WARN_ON_ONCE(test_bit(MD_CLOSING, &mddev->flags)); - set_bit(MD_CLOSING, &mddev->flags); + if (test_and_set_bit(MD_CLOSING, &mddev->flags)) { + mutex_unlock(&mddev->open_mutex); + err = -EBUSY; + goto out; + } did_set_md_closing = true; mutex_unlock(&mddev->open_mutex); sync_blockdev(bdev); @@ -7453,8 +7498,7 @@ static int md_open(struct block_device *bdev, fmode_t mode) /* Wait until bdev->bd_disk is definitely gone */ if (work_pending(&mddev->del_work)) flush_workqueue(md_misc_wq); - /* Then retry the open from the top */ - return -ERESTARTSYS; + return -EBUSY; } BUG_ON(mddev != bdev->bd_disk->private_data); @@ -7596,17 +7640,22 @@ EXPORT_SYMBOL(md_register_thread); void md_unregister_thread(struct md_thread **threadp) { - struct md_thread *thread = *threadp; - if (!thread) - return; - pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); - /* Locking ensures that mddev_unlock does not wake_up a + struct md_thread *thread; + + /* + * Locking ensures that mddev_unlock does not wake_up a * non-existent thread */ spin_lock(&pers_lock); + thread = *threadp; + if (!thread) { + spin_unlock(&pers_lock); + return; + } *threadp = NULL; spin_unlock(&pers_lock); + pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); kthread_stop(thread->tsk); kfree(thread); } @@ -7787,7 +7836,11 @@ static void *md_seq_start(struct seq_file *seq, loff_t *pos) loff_t l = *pos; struct mddev *mddev; - if (l >= 0x10000) + if (l == 0x10000) { + ++*pos; + return (void *)2; + } + if (l > 0x10000) return NULL; if (!l--) /* header */ @@ -8875,11 +8928,11 @@ void md_check_recovery(struct mddev *mddev) } if (mddev_is_clustered(mddev)) { - struct md_rdev *rdev; + struct md_rdev *rdev, *tmp; /* kick the device if another node issued a * remove disk. */ - rdev_for_each(rdev, mddev) { + rdev_for_each_safe(rdev, tmp, mddev) { if (test_and_clear_bit(ClusterRemove, &rdev->flags) && rdev->raid_disk < 0) md_kick_rdev_from_array(rdev); @@ -9179,7 +9232,7 @@ err_wq: static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) { struct mdp_superblock_1 *sb = page_address(rdev->sb_page); - struct md_rdev *rdev2; + struct md_rdev *rdev2, *tmp; int role, ret; char b[BDEVNAME_SIZE]; @@ -9196,7 +9249,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) } /* Check for change of roles in the active devices */ - rdev_for_each(rdev2, mddev) { + rdev_for_each_safe(rdev2, tmp, mddev) { if (test_bit(Faulty, &rdev2->flags)) continue; @@ -9238,8 +9291,11 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) } } - if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) - update_raid_disks(mddev, le32_to_cpu(sb->raid_disks)); + if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) { + ret = update_raid_disks(mddev, le32_to_cpu(sb->raid_disks)); + if (ret) + pr_warn("md: updating array disks failed. %d\n", ret); + } /* Finally set the event to be up to date */ mddev->events = le64_to_cpu(sb->events); @@ -9294,16 +9350,18 @@ static int read_rdev(struct mddev *mddev, struct md_rdev *rdev) void md_reload_sb(struct mddev *mddev, int nr) { - struct md_rdev *rdev; + struct md_rdev *rdev = NULL, *iter; int err; /* Find the rdev */ - rdev_for_each_rcu(rdev, mddev) { - if (rdev->desc_nr == nr) + rdev_for_each_rcu(iter, mddev) { + if (iter->desc_nr == nr) { + rdev = iter; break; + } } - if (!rdev || rdev->desc_nr != nr) { + if (!rdev) { pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr); return; } diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c index 492a3f8ac119..0401daa0f7fb 100644 --- a/drivers/md/persistent-data/dm-block-manager.c +++ b/drivers/md/persistent-data/dm-block-manager.c @@ -494,7 +494,7 @@ int dm_bm_write_lock(struct dm_block_manager *bm, void *p; int r; - if (bm->read_only) + if (dm_bm_is_read_only(bm)) return -EPERM; p = dm_bufio_read(bm->bufio, b, (struct dm_buffer **) result); @@ -563,7 +563,7 @@ int dm_bm_write_lock_zero(struct dm_block_manager *bm, struct buffer_aux *aux; void *p; - if (bm->read_only) + if (dm_bm_is_read_only(bm)) return -EPERM; p = dm_bufio_new(bm->bufio, b, (struct dm_buffer **) result); @@ -603,7 +603,7 @@ EXPORT_SYMBOL_GPL(dm_bm_unlock); int dm_bm_flush(struct dm_block_manager *bm) { - if (bm->read_only) + if (dm_bm_is_read_only(bm)) return -EPERM; return dm_bufio_write_dirty_buffers(bm->bufio); @@ -617,19 +617,21 @@ void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b) bool dm_bm_is_read_only(struct dm_block_manager *bm) { - return bm->read_only; + return (bm ? bm->read_only : true); } EXPORT_SYMBOL_GPL(dm_bm_is_read_only); void dm_bm_set_read_only(struct dm_block_manager *bm) { - bm->read_only = true; + if (bm) + bm->read_only = true; } EXPORT_SYMBOL_GPL(dm_bm_set_read_only); void dm_bm_set_read_write(struct dm_block_manager *bm) { - bm->read_only = false; + if (bm) + bm->read_only = false; } EXPORT_SYMBOL_GPL(dm_bm_set_read_write); diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h index a240990a7f33..5673f8eb5f88 100644 --- a/drivers/md/persistent-data/dm-btree-internal.h +++ b/drivers/md/persistent-data/dm-btree-internal.h @@ -34,12 +34,12 @@ struct node_header { __le32 max_entries; __le32 value_size; __le32 padding; -} __packed; +} __attribute__((packed, aligned(8))); struct btree_node { struct node_header header; __le64 keys[0]; -} __packed; +} __attribute__((packed, aligned(8))); /* diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c index eff04fa23dfa..63f2baed3c8a 100644 --- a/drivers/md/persistent-data/dm-btree-remove.c +++ b/drivers/md/persistent-data/dm-btree-remove.c @@ -423,9 +423,9 @@ static int rebalance_children(struct shadow_spine *s, memcpy(n, dm_block_data(child), dm_bm_block_size(dm_tm_get_bm(info->tm))); - dm_tm_unlock(info->tm, child); dm_tm_dec(info->tm, dm_block_location(child)); + dm_tm_unlock(info->tm, child); return 0; } @@ -549,7 +549,8 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, delete_at(n, index); } - *new_root = shadow_root(&spine); + if (!r) + *new_root = shadow_root(&spine); exit_shadow_spine(&spine); return r; diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index 8aae0624a297..6383afb88f31 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c @@ -83,14 +83,16 @@ void inc_children(struct dm_transaction_manager *tm, struct btree_node *n, } static int insert_at(size_t value_size, struct btree_node *node, unsigned index, - uint64_t key, void *value) - __dm_written_to_disk(value) + uint64_t key, void *value) + __dm_written_to_disk(value) { uint32_t nr_entries = le32_to_cpu(node->header.nr_entries); + uint32_t max_entries = le32_to_cpu(node->header.max_entries); __le64 key_le = cpu_to_le64(key); if (index > nr_entries || - index >= le32_to_cpu(node->header.max_entries)) { + index >= max_entries || + nr_entries >= max_entries) { DMERR("too many entries in btree node for insert"); __dm_unbless_for_disk(value); return -ENOMEM; diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c index 17aef55ed708..5115a2719603 100644 --- a/drivers/md/persistent-data/dm-space-map-common.c +++ b/drivers/md/persistent-data/dm-space-map-common.c @@ -279,6 +279,11 @@ int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result) struct disk_index_entry ie_disk; struct dm_block *blk; + if (b >= ll->nr_blocks) { + DMERR_LIMIT("metadata block out of bounds"); + return -EINVAL; + } + b = do_div(index, ll->entries_per_block); r = ll->load_ie(ll, index, &ie_disk); if (r < 0) @@ -337,6 +342,8 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, */ begin = do_div(index_begin, ll->entries_per_block); end = do_div(end, ll->entries_per_block); + if (end == 0) + end = ll->entries_per_block; for (i = index_begin; i < index_end; i++, begin = 0) { struct dm_block *blk; diff --git a/drivers/md/persistent-data/dm-space-map-common.h b/drivers/md/persistent-data/dm-space-map-common.h index 8de63ce39bdd..87e17909ef52 100644 --- a/drivers/md/persistent-data/dm-space-map-common.h +++ b/drivers/md/persistent-data/dm-space-map-common.h @@ -33,7 +33,7 @@ struct disk_index_entry { __le64 blocknr; __le32 nr_free; __le32 none_free_before; -} __packed; +} __attribute__ ((packed, aligned(8))); #define MAX_METADATA_BITMAPS 255 @@ -43,7 +43,7 @@ struct disk_metadata_index { __le64 blocknr; struct disk_index_entry index[MAX_METADATA_BITMAPS]; -} __packed; +} __attribute__ ((packed, aligned(8))); struct ll_disk; @@ -86,7 +86,7 @@ struct disk_sm_root { __le64 nr_allocated; __le64 bitmap_root; __le64 ref_count_root; -} __packed; +} __attribute__ ((packed, aligned(8))); #define ENTRIES_PER_BYTE 4 @@ -94,7 +94,7 @@ struct disk_bitmap_header { __le32 csum; __le32 not_used; __le64 blocknr; -} __packed; +} __attribute__ ((packed, aligned(8))); enum allocation_event { SM_NONE, diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c index bf4c5e2ccb6f..e0acae7a3815 100644 --- a/drivers/md/persistent-data/dm-space-map-disk.c +++ b/drivers/md/persistent-data/dm-space-map-disk.c @@ -171,6 +171,14 @@ static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b) * Any block we allocate has to be free in both the old and current ll. */ r = sm_ll_find_common_free_block(&smd->old_ll, &smd->ll, smd->begin, smd->ll.nr_blocks, b); + if (r == -ENOSPC) { + /* + * There's no free block between smd->begin and the end of the metadata device. + * We search before smd->begin in case something has been freed. + */ + r = sm_ll_find_common_free_block(&smd->old_ll, &smd->ll, 0, smd->begin, b); + } + if (r) return r; @@ -199,7 +207,6 @@ static int sm_disk_commit(struct dm_space_map *sm) return r; memcpy(&smd->old_ll, &smd->ll, sizeof(smd->old_ll)); - smd->begin = 0; smd->nr_allocated_this_transaction = 0; r = sm_disk_get_nr_free(sm, &nr_free); diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index 9e3c64ec2026..da439ac85796 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -452,6 +452,14 @@ static int sm_metadata_new_block_(struct dm_space_map *sm, dm_block_t *b) * Any block we allocate has to be free in both the old and current ll. */ r = sm_ll_find_common_free_block(&smm->old_ll, &smm->ll, smm->begin, smm->ll.nr_blocks, b); + if (r == -ENOSPC) { + /* + * There's no free block between smm->begin and the end of the metadata device. + * We search before smm->begin in case something has been freed. + */ + r = sm_ll_find_common_free_block(&smm->old_ll, &smm->ll, 0, smm->begin, b); + } + if (r) return r; @@ -503,7 +511,6 @@ static int sm_metadata_commit(struct dm_space_map *sm) return r; memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll)); - smm->begin = 0; smm->allocated_this_transaction = 0; return 0; diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 0272102b207e..6f5710e833c1 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -70,8 +70,8 @@ static void dump_zones(struct mddev *mddev) int len = 0; for (k = 0; k < conf->strip_zone[j].nb_dev; k++) - len += snprintf(line+len, 200-len, "%s%s", k?"/":"", - bdevname(conf->devlist[j*raid_disks + len += scnprintf(line+len, 200-len, "%s%s", k?"/":"", + bdevname(conf->devlist[j*raid_disks + k]->bdev, b)); pr_debug("md: zone%d=[%s]\n", j, line); @@ -150,21 +150,6 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) pr_debug("md/raid0:%s: FINAL %d zones\n", mdname(mddev), conf->nr_strip_zones); - if (conf->nr_strip_zones == 1) { - conf->layout = RAID0_ORIG_LAYOUT; - } else if (mddev->layout == RAID0_ORIG_LAYOUT || - mddev->layout == RAID0_ALT_MULTIZONE_LAYOUT) { - conf->layout = mddev->layout; - } else if (default_layout == RAID0_ORIG_LAYOUT || - default_layout == RAID0_ALT_MULTIZONE_LAYOUT) { - conf->layout = default_layout; - } else { - pr_err("md/raid0:%s: cannot assemble multi-zone RAID0 with default_layout setting\n", - mdname(mddev)); - pr_err("md/raid0: please set raid0.default_layout to 1 or 2\n"); - err = -ENOTSUPP; - goto abort; - } /* * now since we have the hard sector sizes, we can make sure * chunk size is a multiple of that sector size @@ -295,6 +280,34 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) (unsigned long long)smallest->sectors); } + if (conf->nr_strip_zones == 1 || conf->strip_zone[1].nb_dev == 1) { + conf->layout = RAID0_ORIG_LAYOUT; + } else if (mddev->layout == RAID0_ORIG_LAYOUT || + mddev->layout == RAID0_ALT_MULTIZONE_LAYOUT) { + conf->layout = mddev->layout; + } else if (default_layout == RAID0_ORIG_LAYOUT || + default_layout == RAID0_ALT_MULTIZONE_LAYOUT) { + conf->layout = default_layout; + } else { + pr_err("md/raid0:%s: cannot assemble multi-zone RAID0 with default_layout setting\n", + mdname(mddev)); + pr_err("md/raid0: please set raid0.default_layout to 1 or 2\n"); + err = -EOPNOTSUPP; + goto abort; + } + + if (conf->layout == RAID0_ORIG_LAYOUT) { + for (i = 1; i < conf->nr_strip_zones; i++) { + sector_t first_sector = conf->strip_zone[i-1].zone_end; + + sector_div(first_sector, mddev->chunk_sectors); + zone = conf->strip_zone + i; + /* disk_shift is first disk index used in the zone */ + zone->disk_shift = sector_div(first_sector, + zone->nb_dev); + } + } + pr_debug("md/raid0:%s: done.\n", mdname(mddev)); *private_conf = conf; @@ -481,6 +494,20 @@ static inline int is_io_in_chunk_boundary(struct mddev *mddev, } } +/* + * Convert disk_index to the disk order in which it is read/written. + * For example, if we have 4 disks, they are numbered 0,1,2,3. If we + * write the disks starting at disk 3, then the read/write order would + * be disk 3, then 0, then 1, and then disk 2 and we want map_disk_shift() + * to map the disks as follows 0,1,2,3 => 1,2,3,0. So disk 0 would map + * to 1, 1 to 2, 2 to 3, and 3 to 0. That way we can compare disks in + * that 'output' space to understand the read/write disk ordering. + */ +static int map_disk_shift(int disk_index, int num_disks, int disk_shift) +{ + return ((disk_index + num_disks - disk_shift) % num_disks); +} + static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) { struct r0conf *conf = mddev->private; @@ -494,7 +521,9 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) sector_t end_disk_offset; unsigned int end_disk_index; unsigned int disk; + sector_t orig_start, orig_end; + orig_start = start; zone = find_zone(conf, &start); if (bio_end_sector(bio) > zone->zone_end) { @@ -508,6 +537,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) } else end = bio_end_sector(bio); + orig_end = end; if (zone != conf->strip_zone) end = end - zone[-1].zone_end; @@ -519,13 +549,26 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) last_stripe_index = end; sector_div(last_stripe_index, stripe_size); - start_disk_index = (int)(start - first_stripe_index * stripe_size) / - mddev->chunk_sectors; + /* In the first zone the original and alternate layouts are the same */ + if ((conf->layout == RAID0_ORIG_LAYOUT) && (zone != conf->strip_zone)) { + sector_div(orig_start, mddev->chunk_sectors); + start_disk_index = sector_div(orig_start, zone->nb_dev); + start_disk_index = map_disk_shift(start_disk_index, + zone->nb_dev, + zone->disk_shift); + sector_div(orig_end, mddev->chunk_sectors); + end_disk_index = sector_div(orig_end, zone->nb_dev); + end_disk_index = map_disk_shift(end_disk_index, + zone->nb_dev, zone->disk_shift); + } else { + start_disk_index = (int)(start - first_stripe_index * stripe_size) / + mddev->chunk_sectors; + end_disk_index = (int)(end - last_stripe_index * stripe_size) / + mddev->chunk_sectors; + } start_disk_offset = ((int)(start - first_stripe_index * stripe_size) % mddev->chunk_sectors) + first_stripe_index * mddev->chunk_sectors; - end_disk_index = (int)(end - last_stripe_index * stripe_size) / - mddev->chunk_sectors; end_disk_offset = ((int)(end - last_stripe_index * stripe_size) % mddev->chunk_sectors) + last_stripe_index * mddev->chunk_sectors; @@ -534,18 +577,22 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) sector_t dev_start, dev_end; struct bio *discard_bio = NULL; struct md_rdev *rdev; + int compare_disk; + + compare_disk = map_disk_shift(disk, zone->nb_dev, + zone->disk_shift); - if (disk < start_disk_index) + if (compare_disk < start_disk_index) dev_start = (first_stripe_index + 1) * mddev->chunk_sectors; - else if (disk > start_disk_index) + else if (compare_disk > start_disk_index) dev_start = first_stripe_index * mddev->chunk_sectors; else dev_start = start_disk_offset; - if (disk < end_disk_index) + if (compare_disk < end_disk_index) dev_end = (last_stripe_index + 1) * mddev->chunk_sectors; - else if (disk > end_disk_index) + else if (compare_disk > end_disk_index) dev_end = last_stripe_index * mddev->chunk_sectors; else dev_end = end_disk_offset; diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h index 3816e5477db1..8cc761ca7423 100644 --- a/drivers/md/raid0.h +++ b/drivers/md/raid0.h @@ -6,6 +6,7 @@ struct strip_zone { sector_t zone_end; /* Start of the next zone (in sectors) */ sector_t dev_start; /* Zone offset in real dev (in sectors) */ int nb_dev; /* # of devices attached to the zone */ + int disk_shift; /* start disk for the original layout */ }; /* Linux 3.14 (20d0189b101) made an unintended change to diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index abcb4c3a76c1..5ff06fbcfabf 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -445,6 +445,8 @@ static void raid1_end_write_request(struct bio *bio) if (!test_bit(Faulty, &rdev->flags)) set_bit(R1BIO_WriteError, &r1_bio->state); else { + /* Fail the request */ + set_bit(R1BIO_Degraded, &r1_bio->state); /* Finished with this branch */ r1_bio->bios[mirror] = NULL; to_put = bio; @@ -1783,6 +1785,9 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) int number = rdev->raid_disk; struct raid1_info *p = conf->mirrors + number; + if (unlikely(number >= conf->raid_disks)) + goto abort; + if (rdev != p->rdev) p = conf->mirrors + conf->raid_disks + number; @@ -3108,6 +3113,7 @@ static int raid1_run(struct mddev *mddev) * RAID1 needs at least one disk in active */ if (conf->raid_disks - mddev->degraded < 1) { + md_unregister_thread(&conf->thread); ret = -EINVAL; goto abort; } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 02c5e390f89f..bee694be2013 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -781,8 +781,16 @@ static struct md_rdev *read_balance(struct r10conf *conf, disk = r10_bio->devs[slot].devnum; rdev = rcu_dereference(conf->mirrors[disk].replacement); if (rdev == NULL || test_bit(Faulty, &rdev->flags) || - r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) + r10_bio->devs[slot].addr + sectors > + rdev->recovery_offset) { + /* + * Read replacement first to prevent reading both rdev + * and replacement as NULL during replacement replace + * rdev. + */ + smp_mb(); rdev = rcu_dereference(conf->mirrors[disk].rdev); + } if (rdev == NULL || test_bit(Faulty, &rdev->flags)) continue; @@ -934,6 +942,7 @@ static void flush_pending_writes(struct r10conf *conf) else generic_make_request(bio); bio = next; + cond_resched(); } blk_finish_plug(&plug); } else @@ -1119,6 +1128,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) else generic_make_request(bio); bio = next; + cond_resched(); } kfree(plug); } @@ -1138,7 +1148,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, struct md_rdev *err_rdev = NULL; gfp_t gfp = GFP_NOIO; - if (r10_bio->devs[slot].rdev) { + if (slot >= 0 && r10_bio->devs[slot].rdev) { /* * This is an error retry, but we cannot * safely dereference the rdev in the r10_bio, @@ -1400,9 +1410,15 @@ retry_write: for (i = 0; i < conf->copies; i++) { int d = r10_bio->devs[i].devnum; - struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); - struct md_rdev *rrdev = rcu_dereference( - conf->mirrors[d].replacement); + struct md_rdev *rdev, *rrdev; + + rrdev = rcu_dereference(conf->mirrors[d].replacement); + /* + * Read replacement first to prevent reading both rdev and + * replacement as NULL during replacement replace rdev. + */ + smp_mb(); + rdev = rcu_dereference(conf->mirrors[d].rdev); if (rdev == rrdev) rrdev = NULL; if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { @@ -1547,6 +1563,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) r10_bio->mddev = mddev; r10_bio->sector = bio->bi_iter.bi_sector; r10_bio->state = 0; + r10_bio->read_slot = -1; memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->copies); if (bio_data_dir(bio) == READ) @@ -1862,9 +1879,12 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) int err = 0; int number = rdev->raid_disk; struct md_rdev **rdevp; - struct raid10_info *p = conf->mirrors + number; + struct raid10_info *p; print_conf(conf); + if (unlikely(number >= mddev->raid_disks)) + return 0; + p = conf->mirrors + number; if (rdev == p->rdev) rdevp = &p->rdev; else if (rdev == p->replacement) @@ -2262,11 +2282,22 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) { struct r10conf *conf = mddev->private; int d; - struct bio *wbio, *wbio2; + struct bio *wbio = r10_bio->devs[1].bio; + struct bio *wbio2 = r10_bio->devs[1].repl_bio; + + /* Need to test wbio2->bi_end_io before we call + * generic_make_request as if the former is NULL, + * the latter is free to free wbio2. + */ + if (wbio2 && !wbio2->bi_end_io) + wbio2 = NULL; if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) { fix_recovery_read_error(r10_bio); - end_sync_request(r10_bio); + if (wbio->bi_end_io) + end_sync_request(r10_bio); + if (wbio2) + end_sync_request(r10_bio); return; } @@ -2275,14 +2306,6 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) * and submit the write request */ d = r10_bio->devs[1].devnum; - wbio = r10_bio->devs[1].bio; - wbio2 = r10_bio->devs[1].repl_bio; - /* Need to test wbio2->bi_end_io before we call - * generic_make_request as if the former is NULL, - * the latter is free to free wbio2. - */ - if (wbio2 && !wbio2->bi_end_io) - wbio2 = NULL; if (wbio->bi_end_io) { atomic_inc(&conf->mirrors[d].rdev->nr_pending); md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio)); @@ -2950,10 +2973,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, sector_t chunk_mask = conf->geo.chunk_mask; int page_idx = 0; - if (!mempool_initialized(&conf->r10buf_pool)) - if (init_resync(conf)) - return 0; - /* * Allow skipping a full rebuild for incremental assembly * of a clean array, like RAID1 does. @@ -2969,6 +2988,10 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, return mddev->dev_sectors - sector_nr; } + if (!mempool_initialized(&conf->r10buf_pool)) + if (init_resync(conf)) + return 0; + skipped: max_sector = mddev->dev_sectors; if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || @@ -3664,6 +3687,20 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new) return nc*fc; } +static void raid10_free_conf(struct r10conf *conf) +{ + if (!conf) + return; + + mempool_exit(&conf->r10bio_pool); + kfree(conf->mirrors); + kfree(conf->mirrors_old); + kfree(conf->mirrors_new); + safe_put_page(conf->tmppage); + bioset_exit(&conf->bio_split); + kfree(conf); +} + static struct r10conf *setup_conf(struct mddev *mddev) { struct r10conf *conf = NULL; @@ -3746,20 +3783,24 @@ static struct r10conf *setup_conf(struct mddev *mddev) return conf; out: - if (conf) { - mempool_exit(&conf->r10bio_pool); - kfree(conf->mirrors); - safe_put_page(conf->tmppage); - bioset_exit(&conf->bio_split); - kfree(conf); - } + raid10_free_conf(conf); return ERR_PTR(err); } +static void raid10_set_io_opt(struct r10conf *conf) +{ + int raid_disks = conf->geo.raid_disks; + + if (!(conf->geo.raid_disks % conf->geo.near_copies)) + raid_disks /= conf->geo.near_copies; + blk_queue_io_opt(conf->mddev->queue, (conf->mddev->chunk_sectors << 9) * + raid_disks); +} + static int raid10_run(struct mddev *mddev) { struct r10conf *conf; - int i, disk_idx, chunk_size; + int i, disk_idx; struct raid10_info *disk; struct md_rdev *rdev; sector_t size; @@ -3780,6 +3821,9 @@ static int raid10_run(struct mddev *mddev) if (!conf) goto out; + mddev->thread = conf->thread; + conf->thread = NULL; + if (mddev_is_clustered(conf->mddev)) { int fc, fo; @@ -3792,21 +3836,13 @@ static int raid10_run(struct mddev *mddev) } } - mddev->thread = conf->thread; - conf->thread = NULL; - - chunk_size = mddev->chunk_sectors << 9; if (mddev->queue) { blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors); blk_queue_max_write_same_sectors(mddev->queue, 0); blk_queue_max_write_zeroes_sectors(mddev->queue, 0); - blk_queue_io_min(mddev->queue, chunk_size); - if (conf->geo.raid_disks % conf->geo.near_copies) - blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); - else - blk_queue_io_opt(mddev->queue, chunk_size * - (conf->geo.raid_disks / conf->geo.near_copies)); + blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); + raid10_set_io_opt(conf); } rdev_for_each(rdev, mddev) { @@ -3966,10 +4002,7 @@ static int raid10_run(struct mddev *mddev) out_free_conf: md_unregister_thread(&mddev->thread); - mempool_exit(&conf->r10bio_pool); - safe_put_page(conf->tmppage); - kfree(conf->mirrors); - kfree(conf); + raid10_free_conf(conf); mddev->private = NULL; out: return -EIO; @@ -3977,15 +4010,7 @@ out: static void raid10_free(struct mddev *mddev, void *priv) { - struct r10conf *conf = priv; - - mempool_exit(&conf->r10bio_pool); - safe_put_page(conf->tmppage); - kfree(conf->mirrors); - kfree(conf->mirrors_old); - kfree(conf->mirrors_new); - bioset_exit(&conf->bio_split); - kfree(conf); + raid10_free_conf(priv); } static void raid10_quiesce(struct mddev *mddev, int quiesce) @@ -4720,6 +4745,7 @@ static void end_reshape(struct r10conf *conf) stripe /= conf->geo.near_copies; if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe) conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe; + raid10_set_io_opt(conf); } conf->fullsync = 0; } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d91154d65455..b98abe927d06 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -44,6 +44,7 @@ */ #include <linux/blkdev.h> +#include <linux/delay.h> #include <linux/kthread.h> #include <linux/raid/pq.h> #include <linux/async_tx.h> @@ -2417,8 +2418,6 @@ static int resize_stripes(struct r5conf *conf, int newsize) } else err = -ENOMEM; - mutex_unlock(&conf->cache_size_mutex); - conf->slab_cache = sc; conf->active_name = 1-conf->active_name; @@ -2441,6 +2440,8 @@ static int resize_stripes(struct r5conf *conf, int newsize) if (!err) conf->pool_size = newsize; + mutex_unlock(&conf->cache_size_mutex); + return err; } @@ -2602,7 +2603,7 @@ static void raid5_end_write_request(struct bio *bi) struct stripe_head *sh = bi->bi_private; struct r5conf *conf = sh->raid_conf; int disks = sh->disks, i; - struct md_rdev *uninitialized_var(rdev); + struct md_rdev *rdev; sector_t first_bad; int bad_sectors; int replacement = 0; @@ -2670,10 +2671,10 @@ static void raid5_end_write_request(struct bio *bi) if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); - raid5_release_stripe(sh); if (sh->batch_head && sh != sh->batch_head) raid5_release_stripe(sh->batch_head); + raid5_release_stripe(sh); } static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) @@ -3723,7 +3724,7 @@ static void handle_stripe_fill(struct stripe_head *sh, * back cache (prexor with orig_page, and then xor with * page) in the read path */ - if (s->injournal && s->failed) { + if (s->to_read && s->injournal && s->failed) { if (test_bit(STRIPE_R5C_CACHING, &sh->state)) r5c_make_stripe_write_out(sh); goto out; @@ -6329,7 +6330,18 @@ static void raid5d(struct md_thread *thread) spin_unlock_irq(&conf->device_lock); md_check_recovery(mddev); spin_lock_irq(&conf->device_lock); + + /* + * Waiting on MD_SB_CHANGE_PENDING below may deadlock + * seeing md_check_recovery() is needed to clear + * the flag when using mdmon. + */ + continue; } + + wait_event_lock_irq(mddev->sb_wait, + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags), + conf->device_lock); } pr_debug("%d stripes handled\n", handled); @@ -7141,6 +7153,12 @@ static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded return 0; } +static void raid5_set_io_opt(struct r5conf *conf) +{ + blk_queue_io_opt(conf->mddev->queue, (conf->chunk_sectors << 9) * + (conf->raid_disks - conf->max_degraded)); +} + static int raid5_run(struct mddev *mddev) { struct r5conf *conf; @@ -7430,8 +7448,7 @@ static int raid5_run(struct mddev *mddev) chunk_size = mddev->chunk_sectors << 9; blk_queue_io_min(mddev->queue, chunk_size); - blk_queue_io_opt(mddev->queue, chunk_size * - (conf->raid_disks - conf->max_degraded)); + raid5_set_io_opt(conf); mddev->queue->limits.raid_partial_stripes_expensive = 1; /* * We can only discard a whole stripe. It doesn't make sense to @@ -7716,6 +7733,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) */ if (rdev->saved_raid_disk >= 0 && rdev->saved_raid_disk >= first && + rdev->saved_raid_disk <= last && conf->disks[rdev->saved_raid_disk].rdev == NULL) first = rdev->saved_raid_disk; @@ -8024,6 +8042,7 @@ static void end_reshape(struct r5conf *conf) / PAGE_SIZE); if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe) conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe; + raid5_set_io_opt(conf); } } } |