diff options
Diffstat (limited to 'drivers/md')
33 files changed, 453 insertions, 370 deletions
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index e4a3f692057b..a3763d664a67 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -548,6 +548,7 @@ struct cache_set { */ wait_queue_head_t btree_cache_wait; struct task_struct *btree_cache_alloc_lock; + spinlock_t btree_cannibalize_lock; /* * When we free a btree node, we increment the gen of the bucket the diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 1edf9515345e..03fb06c61e1c 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -319,7 +319,7 @@ int bch_btree_keys_alloc(struct btree_keys *b, unsigned page_order, gfp_t gfp) b->page_order = page_order; - t->data = (void *) __get_free_pages(gfp, b->page_order); + t->data = (void *) __get_free_pages(__GFP_COMP|gfp, b->page_order); if (!t->data) goto err; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 96a6583e7b52..fba0fff8040d 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -794,7 +794,7 @@ int bch_btree_cache_alloc(struct cache_set *c) mutex_init(&c->verify_lock); c->verify_ondisk = (void *) - __get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c))); + __get_free_pages(GFP_KERNEL|__GFP_COMP, ilog2(bucket_pages(c))); c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); @@ -840,15 +840,17 @@ out: static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op) { - struct task_struct *old; - - old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current); - if (old && old != current) { + spin_lock(&c->btree_cannibalize_lock); + if (likely(c->btree_cache_alloc_lock == NULL)) { + c->btree_cache_alloc_lock = current; + } else if (c->btree_cache_alloc_lock != current) { if (op) prepare_to_wait(&c->btree_cache_wait, &op->wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&c->btree_cannibalize_lock); return -EINTR; } + spin_unlock(&c->btree_cannibalize_lock); return 0; } @@ -883,10 +885,12 @@ static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op, */ static void bch_cannibalize_unlock(struct cache_set *c) { + spin_lock(&c->btree_cannibalize_lock); if (c->btree_cache_alloc_lock == current) { c->btree_cache_alloc_lock = NULL; wake_up(&c->btree_cache_wait); } + spin_unlock(&c->btree_cannibalize_lock); } static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op, @@ -1374,7 +1378,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, if (__set_blocks(n1, n1->keys + n2->keys, block_bytes(b->c)) > btree_blocks(new_nodes[i])) - goto out_nocoalesce; + goto out_unlock_nocoalesce; keys = n2->keys; /* Take the key of the node we're getting rid of */ @@ -1403,7 +1407,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, if (__bch_keylist_realloc(&keylist, bkey_u64s(&new_nodes[i]->key))) - goto out_nocoalesce; + goto out_unlock_nocoalesce; bch_btree_node_write(new_nodes[i], &cl); bch_keylist_add(&keylist, &new_nodes[i]->key); @@ -1449,6 +1453,10 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, /* Invalidated our iterator */ return -EINTR; +out_unlock_nocoalesce: + for (i = 0; i < nodes; i++) + mutex_unlock(&new_nodes[i]->write_lock); + out_nocoalesce: closure_sync(&cl); bch_keylist_free(&keylist); diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 6394be5ee9a8..6aafda26903c 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -838,8 +838,8 @@ int bch_journal_alloc(struct cache_set *c) j->w[1].c = c; if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || - !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) || - !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS))) + !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP, JSET_BITS)) || + !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP, JSET_BITS))) return -ENOMEM; return 0; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 690aeb09bbf5..6bf1559a1f0d 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1468,7 +1468,7 @@ void bch_cache_set_unregister(struct cache_set *c) } #define alloc_bucket_pages(gfp, c) \ - ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(c)))) + ((void *) __get_free_pages(__GFP_ZERO|__GFP_COMP|gfp, ilog2(bucket_pages(c)))) struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) { @@ -1510,6 +1510,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) sema_init(&c->sb_write_mutex, 1); mutex_init(&c->bucket_lock); init_waitqueue_head(&c->btree_cache_wait); + spin_lock_init(&c->btree_cannibalize_lock); init_waitqueue_head(&c->bucket_wait); init_waitqueue_head(&c->gc_wait); sema_init(&c->uuid_write_mutex, 1); @@ -1780,7 +1781,14 @@ found: sysfs_create_link(&c->kobj, &ca->kobj, buf)) goto err; - if (ca->sb.seq > c->sb.seq) { + /* + * A special case is both ca->sb.seq and c->sb.seq are 0, + * such condition happens on a new created cache device whose + * super block is never flushed yet. In this case c->sb.version + * and other members should be updated too, otherwise we will + * have a mistaken super block version in cache set. + */ + if (ca->sb.seq > c->sb.seq || c->sb.seq == 0) { c->sb.version = ca->sb.version; memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16); c->sb.flags = ca->sb.flags; diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 7eb76a1a2505..521c13f7929c 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1369,7 +1369,7 @@ __acquires(bitmap->lock) if (bitmap->bp[page].hijacked || bitmap->bp[page].map == NULL) csize = ((sector_t)1) << (bitmap->chunkshift + - PAGE_COUNTER_SHIFT - 1); + PAGE_COUNTER_SHIFT); else csize = ((sector_t)1) << bitmap->chunkshift; *blocks = csize - (offset & (csize - 1)); diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 394e53afc259..0acd10d3b7bf 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -536,12 +536,16 @@ static int __create_persistent_data_objects(struct dm_cache_metadata *cmd, CACHE_MAX_CONCURRENT_LOCKS); if (IS_ERR(cmd->bm)) { DMERR("could not create block manager"); - return PTR_ERR(cmd->bm); + r = PTR_ERR(cmd->bm); + cmd->bm = NULL; + return r; } r = __open_or_format_metadata(cmd, may_format_device); - if (r) + if (r) { dm_block_manager_destroy(cmd->bm); + cmd->bm = NULL; + } return r; } diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 69cdb29ef6be..280873b13e74 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -410,7 +410,6 @@ struct cache { spinlock_t lock; struct list_head deferred_cells; struct bio_list deferred_bios; - struct bio_list deferred_writethrough_bios; sector_t migration_threshold; wait_queue_head_t migration_wait; atomic_t nr_allocated_migrations; @@ -446,10 +445,10 @@ struct cache { struct dm_kcopyd_client *copier; struct workqueue_struct *wq; struct work_struct deferred_bio_worker; - struct work_struct deferred_writethrough_worker; struct work_struct migration_worker; struct delayed_work waker; struct dm_bio_prison_v2 *prison; + struct bio_set *bs; mempool_t *migration_pool; @@ -490,15 +489,6 @@ struct per_bio_data { struct dm_bio_prison_cell_v2 *cell; struct dm_hook_info hook_info; sector_t len; - - /* - * writethrough fields. These MUST remain at the end of this - * structure and the 'cache' member must be the first as it - * is used to determine the offset of the writethrough fields. - */ - struct cache *cache; - dm_cblock_t cblock; - struct dm_bio_details bio_details; }; struct dm_cache_migration { @@ -515,19 +505,19 @@ struct dm_cache_migration { /*----------------------------------------------------------------*/ -static bool writethrough_mode(struct cache_features *f) +static bool writethrough_mode(struct cache *cache) { - return f->io_mode == CM_IO_WRITETHROUGH; + return cache->features.io_mode == CM_IO_WRITETHROUGH; } -static bool writeback_mode(struct cache_features *f) +static bool writeback_mode(struct cache *cache) { - return f->io_mode == CM_IO_WRITEBACK; + return cache->features.io_mode == CM_IO_WRITEBACK; } -static inline bool passthrough_mode(struct cache_features *f) +static inline bool passthrough_mode(struct cache *cache) { - return unlikely(f->io_mode == CM_IO_PASSTHROUGH); + return unlikely(cache->features.io_mode == CM_IO_PASSTHROUGH); } /*----------------------------------------------------------------*/ @@ -537,14 +527,9 @@ static void wake_deferred_bio_worker(struct cache *cache) queue_work(cache->wq, &cache->deferred_bio_worker); } -static void wake_deferred_writethrough_worker(struct cache *cache) -{ - queue_work(cache->wq, &cache->deferred_writethrough_worker); -} - static void wake_migration_worker(struct cache *cache) { - if (passthrough_mode(&cache->features)) + if (passthrough_mode(cache)) return; queue_work(cache->wq, &cache->migration_worker); @@ -618,15 +603,9 @@ static unsigned lock_level(struct bio *bio) * Per bio data *--------------------------------------------------------------*/ -/* - * If using writeback, leave out struct per_bio_data's writethrough fields. - */ -#define PB_DATA_SIZE_WB (offsetof(struct per_bio_data, cache)) -#define PB_DATA_SIZE_WT (sizeof(struct per_bio_data)) - static size_t get_per_bio_data_size(struct cache *cache) { - return writethrough_mode(&cache->features) ? PB_DATA_SIZE_WT : PB_DATA_SIZE_WB; + return sizeof(struct per_bio_data); } static struct per_bio_data *get_per_bio_data(struct bio *bio, size_t data_size) @@ -868,16 +847,23 @@ static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio) spin_unlock_irqrestore(&cache->lock, flags); } -static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, - dm_oblock_t oblock) +static void __remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, + dm_oblock_t oblock, bool bio_has_pbd) { - // FIXME: this is called way too much. - check_if_tick_bio_needed(cache, bio); + if (bio_has_pbd) + check_if_tick_bio_needed(cache, bio); remap_to_origin(cache, bio); if (bio_data_dir(bio) == WRITE) clear_discard(cache, oblock_to_dblock(cache, oblock)); } +static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio, + dm_oblock_t oblock) +{ + // FIXME: check_if_tick_bio_needed() is called way too much through this interface + __remap_to_origin_clear_discard(cache, bio, oblock, true); +} + static void remap_to_cache_dirty(struct cache *cache, struct bio *bio, dm_oblock_t oblock, dm_cblock_t cblock) { @@ -937,57 +923,26 @@ static void issue_op(struct bio *bio, void *context) accounted_request(cache, bio); } -static void defer_writethrough_bio(struct cache *cache, struct bio *bio) -{ - unsigned long flags; - - spin_lock_irqsave(&cache->lock, flags); - bio_list_add(&cache->deferred_writethrough_bios, bio); - spin_unlock_irqrestore(&cache->lock, flags); - - wake_deferred_writethrough_worker(cache); -} - -static void writethrough_endio(struct bio *bio) -{ - struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT); - - dm_unhook_bio(&pb->hook_info, bio); - - if (bio->bi_status) { - bio_endio(bio); - return; - } - - dm_bio_restore(&pb->bio_details, bio); - remap_to_cache(pb->cache, bio, pb->cblock); - - /* - * We can't issue this bio directly, since we're in interrupt - * context. So it gets put on a bio list for processing by the - * worker thread. - */ - defer_writethrough_bio(pb->cache, bio); -} - /* - * FIXME: send in parallel, huge latency as is. * When running in writethrough mode we need to send writes to clean blocks - * to both the cache and origin devices. In future we'd like to clone the - * bio and send them in parallel, but for now we're doing them in - * series as this is easier. + * to both the cache and origin devices. Clone the bio and send them in parallel. */ -static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio, - dm_oblock_t oblock, dm_cblock_t cblock) +static void remap_to_origin_and_cache(struct cache *cache, struct bio *bio, + dm_oblock_t oblock, dm_cblock_t cblock) { - struct per_bio_data *pb = get_per_bio_data(bio, PB_DATA_SIZE_WT); + struct bio *origin_bio = bio_clone_fast(bio, GFP_NOIO, cache->bs); - pb->cache = cache; - pb->cblock = cblock; - dm_hook_bio(&pb->hook_info, bio, writethrough_endio, NULL); - dm_bio_record(&pb->bio_details, bio); + BUG_ON(!origin_bio); - remap_to_origin_clear_discard(pb->cache, bio, oblock); + bio_chain(origin_bio, bio); + /* + * Passing false to __remap_to_origin_clear_discard() skips + * all code that might use per_bio_data (since clone doesn't have it) + */ + __remap_to_origin_clear_discard(cache, origin_bio, oblock, false); + submit_bio(origin_bio); + + remap_to_cache(cache, bio, cblock); } /*---------------------------------------------------------------- @@ -1209,7 +1164,7 @@ static bool bio_writes_complete_block(struct cache *cache, struct bio *bio) static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block) { - return writeback_mode(&cache->features) && + return writeback_mode(cache) && (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio)); } @@ -1862,7 +1817,7 @@ static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block, * Passthrough always maps to the origin, invalidating any * cache blocks that are written to. */ - if (passthrough_mode(&cache->features)) { + if (passthrough_mode(cache)) { if (bio_data_dir(bio) == WRITE) { bio_drop_shared_lock(cache, bio); atomic_inc(&cache->stats.demotion); @@ -1871,9 +1826,9 @@ static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block, remap_to_origin_clear_discard(cache, bio, block); } else { - if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) && + if (bio_data_dir(bio) == WRITE && writethrough_mode(cache) && !is_dirty(cache, cblock)) { - remap_to_origin_then_cache(cache, bio, block, cblock); + remap_to_origin_and_cache(cache, bio, block, cblock); accounted_begin(cache, bio); } else remap_to_cache_dirty(cache, bio, block, cblock); @@ -2003,28 +1958,6 @@ static void process_deferred_bios(struct work_struct *ws) schedule_commit(&cache->committer); } -static void process_deferred_writethrough_bios(struct work_struct *ws) -{ - struct cache *cache = container_of(ws, struct cache, deferred_writethrough_worker); - - unsigned long flags; - struct bio_list bios; - struct bio *bio; - - bio_list_init(&bios); - - spin_lock_irqsave(&cache->lock, flags); - bio_list_merge(&bios, &cache->deferred_writethrough_bios); - bio_list_init(&cache->deferred_writethrough_bios); - spin_unlock_irqrestore(&cache->lock, flags); - - /* - * These bios have already been through accounted_begin() - */ - while ((bio = bio_list_pop(&bios))) - generic_make_request(bio); -} - /*---------------------------------------------------------------- * Main worker loop *--------------------------------------------------------------*/ @@ -2132,6 +2065,9 @@ static void destroy(struct cache *cache) kfree(cache->ctr_args[i]); kfree(cache->ctr_args); + if (cache->bs) + bioset_free(cache->bs); + kfree(cache); } @@ -2589,6 +2525,13 @@ static int cache_create(struct cache_args *ca, struct cache **result) cache->features = ca->features; ti->per_io_data_size = get_per_bio_data_size(cache); + if (writethrough_mode(cache)) { + /* Create bioset for writethrough bios issued to origin */ + cache->bs = bioset_create(BIO_POOL_SIZE, 0, 0); + if (!cache->bs) + goto bad; + } + cache->callbacks.congested_fn = cache_is_congested; dm_table_add_target_callbacks(ti->table, &cache->callbacks); @@ -2649,7 +2592,7 @@ static int cache_create(struct cache_args *ca, struct cache **result) goto bad; } - if (passthrough_mode(&cache->features)) { + if (passthrough_mode(cache)) { bool all_clean; r = dm_cache_metadata_all_clean(cache->cmd, &all_clean); @@ -2670,7 +2613,6 @@ static int cache_create(struct cache_args *ca, struct cache **result) spin_lock_init(&cache->lock); INIT_LIST_HEAD(&cache->deferred_cells); bio_list_init(&cache->deferred_bios); - bio_list_init(&cache->deferred_writethrough_bios); atomic_set(&cache->nr_allocated_migrations, 0); atomic_set(&cache->nr_io_migrations, 0); init_waitqueue_head(&cache->migration_wait); @@ -2709,8 +2651,6 @@ static int cache_create(struct cache_args *ca, struct cache **result) goto bad; } INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios); - INIT_WORK(&cache->deferred_writethrough_worker, - process_deferred_writethrough_bios); INIT_WORK(&cache->migration_worker, check_migrations); INIT_DELAYED_WORK(&cache->waker, do_waker); @@ -3279,13 +3219,13 @@ static void cache_status(struct dm_target *ti, status_type_t type, else DMEMIT("1 "); - if (writethrough_mode(&cache->features)) + if (writethrough_mode(cache)) DMEMIT("writethrough "); - else if (passthrough_mode(&cache->features)) + else if (passthrough_mode(cache)) DMEMIT("passthrough "); - else if (writeback_mode(&cache->features)) + else if (writeback_mode(cache)) DMEMIT("writeback "); else { @@ -3451,7 +3391,7 @@ static int process_invalidate_cblocks_message(struct cache *cache, unsigned coun unsigned i; struct cblock_range range; - if (!passthrough_mode(&cache->features)) { + if (!passthrough_mode(cache)) { DMERR("%s: cache has to be in passthrough mode for invalidation", cache_device_name(cache)); return -EPERM; diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index d9a67759fdb5..b8a9695af141 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2188,7 +2188,12 @@ static void *crypt_page_alloc(gfp_t gfp_mask, void *pool_data) struct crypt_config *cc = pool_data; struct page *page; - if (unlikely(percpu_counter_compare(&cc->n_allocated_pages, dm_crypt_pages_per_client) >= 0) && + /* + * Note, percpu_counter_read_positive() may over (and under) estimate + * the current usage by at most (batch - 1) * num_online_cpus() pages, + * but avoids potential spinlock contention of an exact result. + */ + if (unlikely(percpu_counter_read_positive(&cc->n_allocated_pages) >= dm_crypt_pages_per_client) && likely(gfp_mask & __GFP_NORETRY)) return NULL; @@ -3088,7 +3093,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) limits->max_segment_size = PAGE_SIZE; limits->logical_block_size = - max_t(unsigned short, limits->logical_block_size, cc->sector_size); + max_t(unsigned, limits->logical_block_size, cc->sector_size); limits->physical_block_size = max_t(unsigned, limits->physical_block_size, cc->sector_size); limits->io_min = max_t(unsigned, limits->io_min, cc->sector_size); diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index ba84b8d62cd0..adf22692566d 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -46,6 +46,7 @@ struct writeset { static void writeset_free(struct writeset *ws) { vfree(ws->bits); + ws->bits = NULL; } static int setup_on_disk_bitset(struct dm_disk_bitset *info, @@ -70,8 +71,6 @@ static size_t bitset_size(unsigned nr_bits) */ static int writeset_alloc(struct writeset *ws, dm_block_t nr_blocks) { - ws->md.nr_bits = nr_blocks; - ws->md.root = INVALID_WRITESET_ROOT; ws->bits = vzalloc(bitset_size(nr_blocks)); if (!ws->bits) { DMERR("%s: couldn't allocate in memory bitset", __func__); @@ -84,12 +83,14 @@ static int writeset_alloc(struct writeset *ws, dm_block_t nr_blocks) /* * Wipes the in-core bitset, and creates a new on disk bitset. */ -static int writeset_init(struct dm_disk_bitset *info, struct writeset *ws) +static int writeset_init(struct dm_disk_bitset *info, struct writeset *ws, + dm_block_t nr_blocks) { int r; - memset(ws->bits, 0, bitset_size(ws->md.nr_bits)); + memset(ws->bits, 0, bitset_size(nr_blocks)); + ws->md.nr_bits = nr_blocks; r = setup_on_disk_bitset(info, ws->md.nr_bits, &ws->md.root); if (r) { DMERR("%s: setup_on_disk_bitset failed", __func__); @@ -133,7 +134,7 @@ static int writeset_test_and_set(struct dm_disk_bitset *info, { int r; - if (!test_and_set_bit(block, ws->bits)) { + if (!test_bit(block, ws->bits)) { r = dm_bitset_set_bit(info, ws->md.root, block, &ws->md.root); if (r) { /* FIXME: fail mode */ @@ -387,7 +388,7 @@ static void ws_dec(void *context, const void *value) static int ws_eq(void *context, const void *value1, const void *value2) { - return !memcmp(value1, value2, sizeof(struct writeset_metadata)); + return !memcmp(value1, value2, sizeof(struct writeset_disk)); } /*----------------------------------------------------------------*/ @@ -563,6 +564,15 @@ static int open_metadata(struct era_metadata *md) } disk = dm_block_data(sblock); + + /* Verify the data block size hasn't changed */ + if (le32_to_cpu(disk->data_block_size) != md->block_size) { + DMERR("changing the data block size (from %u to %llu) is not supported", + le32_to_cpu(disk->data_block_size), md->block_size); + r = -EINVAL; + goto bad; + } + r = dm_tm_open_with_sm(md->bm, SUPERBLOCK_LOCATION, disk->metadata_space_map_root, sizeof(disk->metadata_space_map_root), @@ -574,10 +584,10 @@ static int open_metadata(struct era_metadata *md) setup_infos(md); - md->block_size = le32_to_cpu(disk->data_block_size); md->nr_blocks = le32_to_cpu(disk->nr_blocks); md->current_era = le32_to_cpu(disk->current_era); + ws_unpack(&disk->current_writeset, &md->current_writeset->md); md->writeset_tree_root = le64_to_cpu(disk->writeset_tree_root); md->era_array_root = le64_to_cpu(disk->era_array_root); md->metadata_snap = le64_to_cpu(disk->metadata_snap); @@ -745,6 +755,12 @@ static int metadata_digest_lookup_writeset(struct era_metadata *md, ws_unpack(&disk, &d->writeset); d->value = cpu_to_le32(key); + /* + * We initialise another bitset info to avoid any caching side effects + * with the previous one. + */ + dm_disk_bitset_init(md->tm, &d->info); + d->nr_bits = min(d->writeset.nr_bits, md->nr_blocks); d->current_bit = 0; d->step = metadata_digest_transcribe_writeset; @@ -758,12 +774,6 @@ static int metadata_digest_start(struct era_metadata *md, struct digest *d) return 0; memset(d, 0, sizeof(*d)); - - /* - * We initialise another bitset info to avoid any caching side - * effects with the previous one. - */ - dm_disk_bitset_init(md->tm, &d->info); d->step = metadata_digest_lookup_writeset; return 0; @@ -801,6 +811,8 @@ static struct era_metadata *metadata_open(struct block_device *bdev, static void metadata_close(struct era_metadata *md) { + writeset_free(&md->writesets[0]); + writeset_free(&md->writesets[1]); destroy_persistent_data_objects(md); kfree(md); } @@ -838,6 +850,7 @@ static int metadata_resize(struct era_metadata *md, void *arg) r = writeset_alloc(&md->writesets[1], *new_size); if (r) { DMERR("%s: writeset_alloc failed for writeset 1", __func__); + writeset_free(&md->writesets[0]); return r; } @@ -848,6 +861,8 @@ static int metadata_resize(struct era_metadata *md, void *arg) &value, &md->era_array_root); if (r) { DMERR("%s: dm_array_resize failed", __func__); + writeset_free(&md->writesets[0]); + writeset_free(&md->writesets[1]); return r; } @@ -869,7 +884,6 @@ static int metadata_era_archive(struct era_metadata *md) } ws_pack(&md->current_writeset->md, &value); - md->current_writeset->md.root = INVALID_WRITESET_ROOT; keys[0] = md->current_era; __dm_bless_for_disk(&value); @@ -881,6 +895,7 @@ static int metadata_era_archive(struct era_metadata *md) return r; } + md->current_writeset->md.root = INVALID_WRITESET_ROOT; md->archived_writesets = true; return 0; @@ -897,7 +912,7 @@ static int metadata_new_era(struct era_metadata *md) int r; struct writeset *new_writeset = next_writeset(md); - r = writeset_init(&md->bitset_info, new_writeset); + r = writeset_init(&md->bitset_info, new_writeset, md->nr_blocks); if (r) { DMERR("%s: writeset_init failed", __func__); return r; @@ -950,7 +965,7 @@ static int metadata_commit(struct era_metadata *md) int r; struct dm_block *sblock; - if (md->current_writeset->md.root != SUPERBLOCK_LOCATION) { + if (md->current_writeset->md.root != INVALID_WRITESET_ROOT) { r = dm_bitset_flush(&md->bitset_info, md->current_writeset->md.root, &md->current_writeset->md.root); if (r) { @@ -1225,8 +1240,10 @@ static void process_deferred_bios(struct era *era) int r; struct bio_list deferred_bios, marked_bios; struct bio *bio; + struct blk_plug plug; bool commit_needed = false; bool failed = false; + struct writeset *ws = era->md->current_writeset; bio_list_init(&deferred_bios); bio_list_init(&marked_bios); @@ -1236,9 +1253,11 @@ static void process_deferred_bios(struct era *era) bio_list_init(&era->deferred_bios); spin_unlock(&era->deferred_lock); + if (bio_list_empty(&deferred_bios)) + return; + while ((bio = bio_list_pop(&deferred_bios))) { - r = writeset_test_and_set(&era->md->bitset_info, - era->md->current_writeset, + r = writeset_test_and_set(&era->md->bitset_info, ws, get_block(era, bio)); if (r < 0) { /* @@ -1246,7 +1265,6 @@ static void process_deferred_bios(struct era *era) * FIXME: finish. */ failed = true; - } else if (r == 0) commit_needed = true; @@ -1262,9 +1280,19 @@ static void process_deferred_bios(struct era *era) if (failed) while ((bio = bio_list_pop(&marked_bios))) bio_io_error(bio); - else - while ((bio = bio_list_pop(&marked_bios))) + else { + blk_start_plug(&plug); + while ((bio = bio_list_pop(&marked_bios))) { + /* + * Only update the in-core writeset if the on-disk one + * was updated too. + */ + if (commit_needed) + set_bit(get_block(era, bio), ws->bits); generic_make_request(bio); + } + blk_finish_plug(&plug); + } } static void process_rpc_calls(struct era *era) @@ -1485,15 +1513,6 @@ static int era_ctr(struct dm_target *ti, unsigned argc, char **argv) } era->md = md; - era->nr_blocks = calc_nr_blocks(era); - - r = metadata_resize(era->md, &era->nr_blocks); - if (r) { - ti->error = "couldn't resize metadata"; - era_destroy(era); - return -ENOMEM; - } - era->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); if (!era->wq) { ti->error = "could not create workqueue for metadata object"; @@ -1571,16 +1590,24 @@ static int era_preresume(struct dm_target *ti) dm_block_t new_size = calc_nr_blocks(era); if (era->nr_blocks != new_size) { - r = in_worker1(era, metadata_resize, &new_size); - if (r) + r = metadata_resize(era->md, &new_size); + if (r) { + DMERR("%s: metadata_resize failed", __func__); + return r; + } + + r = metadata_commit(era->md); + if (r) { + DMERR("%s: metadata_commit failed", __func__); return r; + } era->nr_blocks = new_size; } start_worker(era); - r = in_worker0(era, metadata_new_era); + r = in_worker0(era, metadata_era_rollover); if (r) { DMERR("%s: metadata_era_rollover failed", __func__); return r; diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index ca948155191a..2f020401d5ba 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -529,7 +529,7 @@ static int list_devices(struct file *filp, struct dm_ioctl *param, size_t param_ * Grab our output buffer. */ nl = orig_nl = get_result_buffer(param, param_size, &len); - if (len < needed) { + if (len < needed || len < sizeof(nl->dev)) { param->flags |= DM_BUFFER_FULL_FLAG; goto out; } @@ -1574,6 +1574,7 @@ static int target_message(struct file *filp, struct dm_ioctl *param, size_t para if (!argc) { DMWARN("Empty message received."); + r = -EINVAL; goto out_argv; } diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 8b7328666eaa..7c60aace8d25 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1815,7 +1815,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti, int r; current_pgpath = READ_ONCE(m->current_pgpath); - if (!current_pgpath) + if (!current_pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) current_pgpath = choose_pgpath(m, 0); if (current_pgpath) { diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index eadfcfd106ff..5cf64a823819 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -95,9 +95,6 @@ static void dm_old_stop_queue(struct request_queue *q) static void dm_mq_stop_queue(struct request_queue *q) { - if (blk_mq_queue_stopped(q)) - return; - blk_mq_quiesce_queue(q); } @@ -825,6 +822,7 @@ out_tag_set: blk_mq_free_tag_set(md->tag_set); out_kfree_tag_set: kfree(md->tag_set); + md->tag_set = NULL; return err; } @@ -834,6 +832,7 @@ void dm_mq_cleanup_mapped_device(struct mapped_device *md) if (md->tag_set) { blk_mq_free_tag_set(md->tag_set); kfree(md->tag_set); + md->tag_set = NULL; } } diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 2170f6c118b8..fc57b2e40308 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -137,6 +137,11 @@ struct dm_snapshot { * for them to be committed. */ struct bio_list bios_queued_during_merge; + + /* + * Flush data after merge. + */ + struct bio flush_bio; }; /* @@ -788,7 +793,7 @@ static int dm_add_exception(void *context, chunk_t old, chunk_t new) static uint32_t __minimum_chunk_size(struct origin *o) { struct dm_snapshot *snap; - unsigned chunk_size = 0; + unsigned chunk_size = rounddown_pow_of_two(UINT_MAX); if (o) list_for_each_entry(snap, &o->snapshots, list) @@ -1060,6 +1065,17 @@ shut: static void error_bios(struct bio *bio); +static int flush_data(struct dm_snapshot *s) +{ + struct bio *flush_bio = &s->flush_bio; + + bio_reset(flush_bio); + bio_set_dev(flush_bio, s->origin->bdev); + flush_bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; + + return submit_bio_wait(flush_bio); +} + static void merge_callback(int read_err, unsigned long write_err, void *context) { struct dm_snapshot *s = context; @@ -1073,6 +1089,11 @@ static void merge_callback(int read_err, unsigned long write_err, void *context) goto shut; } + if (flush_data(s) < 0) { + DMERR("Flush after merge failed: shutting down merge"); + goto shut; + } + if (s->store->type->commit_merge(s->store, s->num_merging_chunks) < 0) { DMERR("Write error in exception store: shutting down merge"); @@ -1197,6 +1218,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) s->first_merging_chunk = 0; s->num_merging_chunks = 0; bio_list_init(&s->bios_queued_during_merge); + bio_init(&s->flush_bio, NULL, 0); /* Allocate hash table for COW data */ if (init_hash_tables(s)) { @@ -1264,6 +1286,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (!s->store->chunk_size) { ti->error = "Chunk size not set"; + r = -EINVAL; goto bad_read_metadata; } @@ -1391,6 +1414,8 @@ static void snapshot_dtr(struct dm_target *ti) mutex_destroy(&s->lock); + bio_uninit(&s->flush_bio); + dm_put_device(ti, s->cow); dm_put_device(ti, s->origin); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 8f070debe498..3b2a880eed68 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -431,14 +431,23 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, { int r; dev_t dev; + unsigned int major, minor; + char dummy; struct dm_dev_internal *dd; struct dm_table *t = ti->table; BUG_ON(!t); - dev = dm_get_dev_t(path); - if (!dev) - return -ENODEV; + if (sscanf(path, "%u:%u%c", &major, &minor, &dummy) == 2) { + /* Extract the major/minor numbers */ + dev = MKDEV(major, minor); + if (MAJOR(dev) != major || MINOR(dev) != minor) + return -EOVERFLOW; + } else { + dev = dm_get_dev_t(path); + if (!dev) + return -ENODEV; + } dd = find_device(&t->devices, dev); if (!dd) { @@ -547,14 +556,14 @@ static int adjoin(struct dm_table *table, struct dm_target *ti) * On the other hand, dm-switch needs to process bulk data using messages and * excessive use of GFP_NOIO could cause trouble. */ -static char **realloc_argv(unsigned *array_size, char **old_argv) +static char **realloc_argv(unsigned *size, char **old_argv) { char **argv; unsigned new_size; gfp_t gfp; - if (*array_size) { - new_size = *array_size * 2; + if (*size) { + new_size = *size * 2; gfp = GFP_KERNEL; } else { new_size = 8; @@ -562,8 +571,8 @@ static char **realloc_argv(unsigned *array_size, char **old_argv) } argv = kmalloc(new_size * sizeof(*argv), gfp); if (argv) { - memcpy(argv, old_argv, *array_size * sizeof(*argv)); - *array_size = new_size; + memcpy(argv, old_argv, *size * sizeof(*argv)); + *size = new_size; } kfree(old_argv); @@ -880,10 +889,10 @@ void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type) } EXPORT_SYMBOL_GPL(dm_table_set_type); -static int device_supports_dax(struct dm_target *ti, struct dm_dev *dev, +static int device_not_dax_capable(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { - return bdev_dax_supported(dev->bdev, PAGE_SIZE); + return !bdev_dax_supported(dev->bdev, PAGE_SIZE); } static bool dm_table_supports_dax(struct dm_table *t) @@ -899,7 +908,7 @@ static bool dm_table_supports_dax(struct dm_table *t) return false; if (!ti->type->iterate_devices || - !ti->type->iterate_devices(ti, device_supports_dax, NULL)) + ti->type->iterate_devices(ti, device_not_dax_capable, NULL)) return false; } @@ -1295,12 +1304,6 @@ void dm_table_event_callback(struct dm_table *t, void dm_table_event(struct dm_table *t) { - /* - * You can no longer call dm_table_event() from interrupt - * context, use a bottom half instead. - */ - BUG_ON(in_interrupt()); - mutex_lock(&_event_lock); if (t->event_fn) t->event_fn(t->event_context); @@ -1348,6 +1351,46 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector) return &t->targets[(KEYS_PER_NODE * n) + k]; } +/* + * type->iterate_devices() should be called when the sanity check needs to + * iterate and check all underlying data devices. iterate_devices() will + * iterate all underlying data devices until it encounters a non-zero return + * code, returned by whether the input iterate_devices_callout_fn, or + * iterate_devices() itself internally. + * + * For some target type (e.g. dm-stripe), one call of iterate_devices() may + * iterate multiple underlying devices internally, in which case a non-zero + * return code returned by iterate_devices_callout_fn will stop the iteration + * in advance. + * + * Cases requiring _any_ underlying device supporting some kind of attribute, + * should use the iteration structure like dm_table_any_dev_attr(), or call + * it directly. @func should handle semantics of positive examples, e.g. + * capable of something. + * + * Cases requiring _all_ underlying devices supporting some kind of attribute, + * should use the iteration structure like dm_table_supports_nowait() or + * dm_table_supports_discards(). Or introduce dm_table_all_devs_attr() that + * uses an @anti_func that handle semantics of counter examples, e.g. not + * capable of something. So: return !dm_table_any_dev_attr(t, anti_func, data); + */ +static bool dm_table_any_dev_attr(struct dm_table *t, + iterate_devices_callout_fn func, void *data) +{ + struct dm_target *ti; + unsigned int i; + + for (i = 0; i < dm_table_get_num_targets(t); i++) { + ti = dm_table_get_target(t, i); + + if (ti->type->iterate_devices && + ti->type->iterate_devices(ti, func, data)) + return true; + } + + return false; +} + static int count_device(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { @@ -1384,13 +1427,13 @@ bool dm_table_has_no_data_devices(struct dm_table *table) return true; } -static int device_is_zoned_model(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) +static int device_not_zoned_model(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) { struct request_queue *q = bdev_get_queue(dev->bdev); enum blk_zoned_model *zoned_model = data; - return q && blk_queue_zoned_model(q) == *zoned_model; + return !q || blk_queue_zoned_model(q) != *zoned_model; } static bool dm_table_supports_zoned_model(struct dm_table *t, @@ -1407,37 +1450,20 @@ static bool dm_table_supports_zoned_model(struct dm_table *t, return false; if (!ti->type->iterate_devices || - !ti->type->iterate_devices(ti, device_is_zoned_model, &zoned_model)) + ti->type->iterate_devices(ti, device_not_zoned_model, &zoned_model)) return false; } return true; } -static int device_matches_zone_sectors(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) +static int device_not_matches_zone_sectors(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) { struct request_queue *q = bdev_get_queue(dev->bdev); unsigned int *zone_sectors = data; - return q && blk_queue_zone_sectors(q) == *zone_sectors; -} - -static bool dm_table_matches_zone_sectors(struct dm_table *t, - unsigned int zone_sectors) -{ - struct dm_target *ti; - unsigned i; - - for (i = 0; i < dm_table_get_num_targets(t); i++) { - ti = dm_table_get_target(t, i); - - if (!ti->type->iterate_devices || - !ti->type->iterate_devices(ti, device_matches_zone_sectors, &zone_sectors)) - return false; - } - - return true; + return !q || blk_queue_zone_sectors(q) != *zone_sectors; } static int validate_hardware_zoned_model(struct dm_table *table, @@ -1457,7 +1483,7 @@ static int validate_hardware_zoned_model(struct dm_table *table, if (!zone_sectors || !is_power_of_2(zone_sectors)) return -EINVAL; - if (!dm_table_matches_zone_sectors(table, zone_sectors)) { + if (dm_table_any_dev_attr(table, device_not_matches_zone_sectors, &zone_sectors)) { DMERR("%s: zone sectors is not consistent across all devices", dm_device_name(table->md)); return -EINVAL; @@ -1647,29 +1673,12 @@ static int device_dax_write_cache_enabled(struct dm_target *ti, return false; } -static int dm_table_supports_dax_write_cache(struct dm_table *t) -{ - struct dm_target *ti; - unsigned i; - - for (i = 0; i < dm_table_get_num_targets(t); i++) { - ti = dm_table_get_target(t, i); - - if (ti->type->iterate_devices && - ti->type->iterate_devices(ti, - device_dax_write_cache_enabled, NULL)) - return true; - } - - return false; -} - -static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) +static int device_is_rotational(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) { struct request_queue *q = bdev_get_queue(dev->bdev); - return q && blk_queue_nonrot(q); + return q && !blk_queue_nonrot(q); } static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev, @@ -1680,29 +1689,12 @@ static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev, return q && !blk_queue_add_random(q); } -static int queue_supports_sg_merge(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) +static int queue_no_sg_merge(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) { struct request_queue *q = bdev_get_queue(dev->bdev); - return q && !test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags); -} - -static bool dm_table_all_devices_attribute(struct dm_table *t, - iterate_devices_callout_fn func) -{ - struct dm_target *ti; - unsigned i; - - for (i = 0; i < dm_table_get_num_targets(t); i++) { - ti = dm_table_get_target(t, i); - - if (!ti->type->iterate_devices || - !ti->type->iterate_devices(ti, func, NULL)) - return false; - } - - return true; + return q && test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags); } static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *dev, @@ -1801,27 +1793,6 @@ static int device_requires_stable_pages(struct dm_target *ti, return q && bdi_cap_stable_pages_required(q->backing_dev_info); } -/* - * If any underlying device requires stable pages, a table must require - * them as well. Only targets that support iterate_devices are considered: - * don't want error, zero, etc to require stable pages. - */ -static bool dm_table_requires_stable_pages(struct dm_table *t) -{ - struct dm_target *ti; - unsigned i; - - for (i = 0; i < dm_table_get_num_targets(t); i++) { - ti = dm_table_get_target(t, i); - - if (ti->type->iterate_devices && - ti->type->iterate_devices(ti, device_requires_stable_pages, NULL)) - return true; - } - - return false; -} - void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, struct queue_limits *limits) { @@ -1849,32 +1820,35 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, else queue_flag_clear_unlocked(QUEUE_FLAG_DAX, q); - if (dm_table_supports_dax_write_cache(t)) + if (dm_table_any_dev_attr(t, device_dax_write_cache_enabled, NULL)) dax_write_cache(t->md->dax_dev, true); /* Ensure that all underlying devices are non-rotational. */ - if (dm_table_all_devices_attribute(t, device_is_nonrot)) - queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); - else + if (dm_table_any_dev_attr(t, device_is_rotational, NULL)) queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, q); + else + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); if (!dm_table_supports_write_same(t)) q->limits.max_write_same_sectors = 0; if (!dm_table_supports_write_zeroes(t)) q->limits.max_write_zeroes_sectors = 0; - if (dm_table_all_devices_attribute(t, queue_supports_sg_merge)) - queue_flag_clear_unlocked(QUEUE_FLAG_NO_SG_MERGE, q); - else + if (dm_table_any_dev_attr(t, queue_no_sg_merge, NULL)) queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q); + else + queue_flag_clear_unlocked(QUEUE_FLAG_NO_SG_MERGE, q); dm_table_verify_integrity(t); /* * Some devices don't use blk_integrity but still want stable pages * because they do their own checksumming. + * If any underlying device requires stable pages, a table must require + * them as well. Only targets that support iterate_devices are considered: + * don't want error, zero, etc to require stable pages. */ - if (dm_table_requires_stable_pages(t)) + if (dm_table_any_dev_attr(t, device_requires_stable_pages, NULL)) q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES; else q->backing_dev_info->capabilities &= ~BDI_CAP_STABLE_WRITES; @@ -1885,7 +1859,8 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, * Clear QUEUE_FLAG_ADD_RANDOM if any underlying device does not * have it set. */ - if (blk_queue_add_random(q) && dm_table_all_devices_attribute(t, device_is_not_random)) + if (blk_queue_add_random(q) && + dm_table_any_dev_attr(t, device_is_not_random, NULL)) queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q); /* diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index b85a66f42814..29f8a632b42b 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -698,12 +698,16 @@ static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool f THIN_MAX_CONCURRENT_LOCKS); if (IS_ERR(pmd->bm)) { DMERR("could not create block manager"); - return PTR_ERR(pmd->bm); + r = PTR_ERR(pmd->bm); + pmd->bm = NULL; + return r; } r = __open_or_format_metadata(pmd, format_device); - if (r) + if (r) { dm_block_manager_destroy(pmd->bm); + pmd->bm = NULL; + } return r; } diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index e705799976c2..2dae30713eb3 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -552,6 +552,15 @@ static int verity_verify_io(struct dm_verity_io *io) } /* + * Skip verity work in response to I/O error when system is shutting down. + */ +static inline bool verity_is_system_shutting_down(void) +{ + return system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF + || system_state == SYSTEM_RESTART; +} + +/* * End one "io" structure with a given error. */ static void verity_finish_io(struct dm_verity_io *io, blk_status_t status) @@ -578,7 +587,8 @@ static void verity_end_io(struct bio *bio) { struct dm_verity_io *io = bio->bi_private; - if (bio->bi_status && !verity_fec_is_enabled(io->v)) { + if (bio->bi_status && + (!verity_fec_is_enabled(io->v) || verity_is_system_shutting_down())) { verity_finish_io(io, bio->bi_status); return; } diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 4d658a0c6025..c6d3a4bc811c 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1580,7 +1580,7 @@ static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd) return dzone; } - return ERR_PTR(-EBUSY); + return NULL; } /* @@ -1600,7 +1600,7 @@ static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd) return zone; } - return ERR_PTR(-EBUSY); + return NULL; } /* diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index 2fad512dce98..1015b200330b 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -350,8 +350,8 @@ static int dmz_do_reclaim(struct dmz_reclaim *zrc) /* Get a data zone */ dzone = dmz_get_zone_for_reclaim(zmd); - if (IS_ERR(dzone)) - return PTR_ERR(dzone); + if (!dzone) + return -EBUSY; start = jiffies; diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 497a2bc5da51..9d8d453b2735 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -789,7 +789,7 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) } /* Set target (no write same support) */ - ti->max_io_len = dev->zone_nr_sectors << 9; + ti->max_io_len = dev->zone_nr_sectors; ti->num_flush_bios = 1; ti->num_discard_bios = 1; ti->num_write_zeroes_bios = 1; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 02ba6849f89d..0e06432c958f 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -12,6 +12,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/mutex.h> +#include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/blkpg.h> #include <linux/bio.h> @@ -471,7 +472,7 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, * subset of the parent bdev; require extra privileges. */ if (!capable(CAP_SYS_RAWIO)) { - DMWARN_LIMIT( + DMDEBUG_LIMIT( "%s: sending ioctl %x to DM device without required privilege.", current->comm, cmd); r = -ENOIOCTLCMD; @@ -2665,17 +2666,25 @@ EXPORT_SYMBOL_GPL(dm_internal_resume_fast); int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, unsigned cookie) { + int r; + unsigned noio_flag; char udev_cookie[DM_COOKIE_LENGTH]; char *envp[] = { udev_cookie, NULL }; + noio_flag = memalloc_noio_save(); + if (!cookie) - return kobject_uevent(&disk_to_dev(md->disk)->kobj, action); + r = kobject_uevent(&disk_to_dev(md->disk)->kobj, action); else { snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", DM_COOKIE_ENV_VAR_NAME, cookie); - return kobject_uevent_env(&disk_to_dev(md->disk)->kobj, - action, envp); + r = kobject_uevent_env(&disk_to_dev(md->disk)->kobj, + action, envp); } + + memalloc_noio_restore(noio_flag); + + return r; } uint32_t dm_next_uevent_seq(struct mapped_device *md) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 10057ac85476..71a196acdac1 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -659,9 +659,27 @@ out: * Takes the lock on the TOKEN lock resource so no other * node can communicate while the operation is underway. */ -static int lock_token(struct md_cluster_info *cinfo, bool mddev_locked) +static int lock_token(struct md_cluster_info *cinfo) { - int error, set_bit = 0; + int error; + + error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX); + if (error) { + pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n", + __func__, __LINE__, error); + } else { + /* Lock the receive sequence */ + mutex_lock(&cinfo->recv_mutex); + } + return error; +} + +/* lock_comm() + * Sets the MD_CLUSTER_SEND_LOCK bit to lock the send channel. + */ +static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked) +{ + int rv, set_bit = 0; struct mddev *mddev = cinfo->mddev; /* @@ -672,34 +690,19 @@ static int lock_token(struct md_cluster_info *cinfo, bool mddev_locked) */ if (mddev_locked && !test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state)) { - error = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, + rv = test_and_set_bit_lock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); - WARN_ON_ONCE(error); + WARN_ON_ONCE(rv); md_wakeup_thread(mddev->thread); set_bit = 1; } - error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX); - if (set_bit) - clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); - if (error) - pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n", - __func__, __LINE__, error); - - /* Lock the receive sequence */ - mutex_lock(&cinfo->recv_mutex); - return error; -} - -/* lock_comm() - * Sets the MD_CLUSTER_SEND_LOCK bit to lock the send channel. - */ -static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked) -{ wait_event(cinfo->wait, !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state)); - - return lock_token(cinfo, mddev_locked); + rv = lock_token(cinfo); + if (set_bit) + clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); + return rv; } static void unlock_comm(struct md_cluster_info *cinfo) @@ -779,9 +782,11 @@ static int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg, { int ret; - lock_comm(cinfo, mddev_locked); - ret = __sendmsg(cinfo, cmsg); - unlock_comm(cinfo); + ret = lock_comm(cinfo, mddev_locked); + if (!ret) { + ret = __sendmsg(cinfo, cmsg); + unlock_comm(cinfo); + } return ret; } @@ -1053,7 +1058,7 @@ static int metadata_update_start(struct mddev *mddev) return 0; } - ret = lock_token(cinfo, 1); + ret = lock_token(cinfo); clear_bit_unlock(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); return ret; } @@ -1171,7 +1176,10 @@ static void update_size(struct mddev *mddev, sector_t old_dev_sectors) int raid_slot = -1; md_update_sb(mddev, 1); - lock_comm(cinfo, 1); + if (lock_comm(cinfo, 1)) { + pr_err("%s: lock_comm failed\n", __func__); + return; + } memset(&cmsg, 0, sizeof(cmsg)); cmsg.type = cpu_to_le32(METADATA_UPDATED); @@ -1310,7 +1318,8 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev) cmsg.type = cpu_to_le32(NEWDISK); memcpy(cmsg.uuid, uuid, 16); cmsg.raid_slot = cpu_to_le32(rdev->desc_nr); - lock_comm(cinfo, 1); + if (lock_comm(cinfo, 1)) + return -EAGAIN; ret = __sendmsg(cinfo, &cmsg); if (ret) { unlock_comm(cinfo); @@ -1423,6 +1432,7 @@ static void unlock_all_bitmaps(struct mddev *mddev) } } kfree(cinfo->other_bitmap_lockres); + cinfo->other_bitmap_lockres = NULL; } } diff --git a/drivers/md/md.c b/drivers/md/md.c index b942c74f1ce8..5e8706a66c31 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -568,8 +568,35 @@ void mddev_init(struct mddev *mddev) } EXPORT_SYMBOL_GPL(mddev_init); +static struct mddev *mddev_find_locked(dev_t unit) +{ + struct mddev *mddev; + + list_for_each_entry(mddev, &all_mddevs, all_mddevs) + if (mddev->unit == unit) + return mddev; + + return NULL; +} + static struct mddev *mddev_find(dev_t unit) { + struct mddev *mddev; + + if (MAJOR(unit) != MD_MAJOR) + unit &= ~((1 << MdpMinorShift) - 1); + + spin_lock(&all_mddevs_lock); + mddev = mddev_find_locked(unit); + if (mddev) + mddev_get(mddev); + spin_unlock(&all_mddevs_lock); + + return mddev; +} + +static struct mddev *mddev_find_or_alloc(dev_t unit) +{ struct mddev *mddev, *new = NULL; if (unit && MAJOR(unit) != MD_MAJOR) @@ -579,13 +606,13 @@ static struct mddev *mddev_find(dev_t unit) spin_lock(&all_mddevs_lock); if (unit) { - list_for_each_entry(mddev, &all_mddevs, all_mddevs) - if (mddev->unit == unit) { - mddev_get(mddev); - spin_unlock(&all_mddevs_lock); - kfree(new); - return mddev; - } + mddev = mddev_find_locked(unit); + if (mddev) { + mddev_get(mddev); + spin_unlock(&all_mddevs_lock); + kfree(new); + return mddev; + } if (new) { list_add(&new->all_mddevs, &all_mddevs); @@ -611,12 +638,7 @@ static struct mddev *mddev_find(dev_t unit) return NULL; } - is_free = 1; - list_for_each_entry(mddev, &all_mddevs, all_mddevs) - if (mddev->unit == dev) { - is_free = 0; - break; - } + is_free = !mddev_find_locked(dev); } new->unit = dev; new->md_minor = MINOR(dev); @@ -1168,6 +1190,8 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) mddev->new_layout = mddev->layout; mddev->new_chunk_sectors = mddev->chunk_sectors; } + if (mddev->level == 0) + mddev->layout = -1; if (sb->state & (1<<MD_SB_CLEAN)) mddev->recovery_cp = MaxSector; @@ -1584,6 +1608,10 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset; } + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) && + sb->level != 0) + return -EINVAL; + if (!refdev) { ret = 1; } else { @@ -1694,6 +1722,10 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) mddev->new_chunk_sectors = mddev->chunk_sectors; } + if (mddev->level == 0 && + !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT)) + mddev->layout = -1; + if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) set_bit(MD_HAS_JOURNAL, &mddev->flags); @@ -5266,7 +5298,7 @@ static int md_alloc(dev_t dev, char *name) * writing to /sys/module/md_mod/parameters/new_array. */ static DEFINE_MUTEX(disks_mutex); - struct mddev *mddev = mddev_find(dev); + struct mddev *mddev = mddev_find_or_alloc(dev); struct gendisk *disk; int partitioned; int shift; @@ -5343,10 +5375,6 @@ static int md_alloc(dev_t dev, char *name) */ disk->flags |= GENHD_FL_EXT_DEVT; mddev->gendisk = disk; - /* As soon as we call add_disk(), another thread could get - * through to md_open, so make sure it doesn't get too far - */ - mutex_lock(&mddev->open_mutex); add_disk(disk); error = kobject_init_and_add(&mddev->kobj, &md_ktype, @@ -5362,7 +5390,6 @@ static int md_alloc(dev_t dev, char *name) if (mddev->kobj.sd && sysfs_create_group(&mddev->kobj, &md_bitmap_group)) pr_debug("pointless warning\n"); - mutex_unlock(&mddev->open_mutex); abort: mutex_unlock(&disks_mutex); if (!error && mddev->kobj.sd) { @@ -6116,11 +6143,9 @@ static void autorun_devices(int part) md_probe(dev, NULL, NULL); mddev = mddev_find(dev); - if (!mddev || !mddev->gendisk) { - if (mddev) - mddev_put(mddev); + if (!mddev) break; - } + if (mddev_lock(mddev)) pr_warn("md: %s locked, cannot run\n", mdname(mddev)); else if (mddev->raid_disks || mddev->major_version @@ -6527,8 +6552,10 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) goto busy; kick_rdev: - if (mddev_is_clustered(mddev)) - md_cluster_ops->remove_disk(mddev, rdev); + if (mddev_is_clustered(mddev)) { + if (md_cluster_ops->remove_disk(mddev, rdev)) + goto busy; + } md_kick_rdev_from_array(rdev); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); @@ -6757,6 +6784,9 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->external = 0; mddev->layout = info->layout; + if (mddev->level == 0) + /* Cannot trust RAID0 layout info here */ + mddev->layout = -1; mddev->chunk_sectors = info->chunk_size >> 9; if (mddev->persistent) { @@ -7174,8 +7204,11 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, err = -EBUSY; goto out; } - WARN_ON_ONCE(test_bit(MD_CLOSING, &mddev->flags)); - set_bit(MD_CLOSING, &mddev->flags); + if (test_and_set_bit(MD_CLOSING, &mddev->flags)) { + mutex_unlock(&mddev->open_mutex); + err = -EBUSY; + goto out; + } did_set_md_closing = true; mutex_unlock(&mddev->open_mutex); sync_blockdev(bdev); @@ -7411,9 +7444,9 @@ static int md_open(struct block_device *bdev, fmode_t mode) */ mddev_put(mddev); /* Wait until bdev->bd_disk is definitely gone */ - flush_workqueue(md_misc_wq); - /* Then retry the open from the top */ - return -ERESTARTSYS; + if (work_pending(&mddev->del_work)) + flush_workqueue(md_misc_wq); + return -EBUSY; } BUG_ON(mddev != bdev->bd_disk->private_data); @@ -8812,11 +8845,11 @@ void md_check_recovery(struct mddev *mddev) } if (mddev_is_clustered(mddev)) { - struct md_rdev *rdev; + struct md_rdev *rdev, *tmp; /* kick the device if another node issued a * remove disk. */ - rdev_for_each(rdev, mddev) { + rdev_for_each_safe(rdev, tmp, mddev) { if (test_and_clear_bit(ClusterRemove, &rdev->flags) && rdev->raid_disk < 0) md_kick_rdev_from_array(rdev); @@ -9116,7 +9149,7 @@ err_wq: static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) { struct mdp_superblock_1 *sb = page_address(rdev->sb_page); - struct md_rdev *rdev2; + struct md_rdev *rdev2, *tmp; int role, ret; char b[BDEVNAME_SIZE]; @@ -9133,7 +9166,7 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) } /* Check for change of roles in the active devices */ - rdev_for_each(rdev2, mddev) { + rdev_for_each_safe(rdev2, tmp, mddev) { if (test_bit(Faulty, &rdev2->flags)) continue; diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h index a240990a7f33..5673f8eb5f88 100644 --- a/drivers/md/persistent-data/dm-btree-internal.h +++ b/drivers/md/persistent-data/dm-btree-internal.h @@ -34,12 +34,12 @@ struct node_header { __le32 max_entries; __le32 value_size; __le32 padding; -} __packed; +} __attribute__((packed, aligned(8))); struct btree_node { struct node_header header; __le64 keys[0]; -} __packed; +} __attribute__((packed, aligned(8))); /* diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c index eff04fa23dfa..63f2baed3c8a 100644 --- a/drivers/md/persistent-data/dm-btree-remove.c +++ b/drivers/md/persistent-data/dm-btree-remove.c @@ -423,9 +423,9 @@ static int rebalance_children(struct shadow_spine *s, memcpy(n, dm_block_data(child), dm_bm_block_size(dm_tm_get_bm(info->tm))); - dm_tm_unlock(info->tm, child); dm_tm_dec(info->tm, dm_block_location(child)); + dm_tm_unlock(info->tm, child); return 0; } @@ -549,7 +549,8 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, delete_at(n, index); } - *new_root = shadow_root(&spine); + if (!r) + *new_root = shadow_root(&spine); exit_shadow_spine(&spine); return r; diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c index 97f16fe14f54..a2eceb12f01d 100644 --- a/drivers/md/persistent-data/dm-space-map-common.c +++ b/drivers/md/persistent-data/dm-space-map-common.c @@ -337,6 +337,8 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, */ begin = do_div(index_begin, ll->entries_per_block); end = do_div(end, ll->entries_per_block); + if (end == 0) + end = ll->entries_per_block; for (i = index_begin; i < index_end; i++, begin = 0) { struct dm_block *blk; diff --git a/drivers/md/persistent-data/dm-space-map-common.h b/drivers/md/persistent-data/dm-space-map-common.h index 8de63ce39bdd..87e17909ef52 100644 --- a/drivers/md/persistent-data/dm-space-map-common.h +++ b/drivers/md/persistent-data/dm-space-map-common.h @@ -33,7 +33,7 @@ struct disk_index_entry { __le64 blocknr; __le32 nr_free; __le32 none_free_before; -} __packed; +} __attribute__ ((packed, aligned(8))); #define MAX_METADATA_BITMAPS 255 @@ -43,7 +43,7 @@ struct disk_metadata_index { __le64 blocknr; struct disk_index_entry index[MAX_METADATA_BITMAPS]; -} __packed; +} __attribute__ ((packed, aligned(8))); struct ll_disk; @@ -86,7 +86,7 @@ struct disk_sm_root { __le64 nr_allocated; __le64 bitmap_root; __le64 ref_count_root; -} __packed; +} __attribute__ ((packed, aligned(8))); #define ENTRIES_PER_BYTE 4 @@ -94,7 +94,7 @@ struct disk_bitmap_header { __le32 csum; __le32 not_used; __le64 blocknr; -} __packed; +} __attribute__ ((packed, aligned(8))); enum allocation_event { SM_NONE, diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c index bf4c5e2ccb6f..e0acae7a3815 100644 --- a/drivers/md/persistent-data/dm-space-map-disk.c +++ b/drivers/md/persistent-data/dm-space-map-disk.c @@ -171,6 +171,14 @@ static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b) * Any block we allocate has to be free in both the old and current ll. */ r = sm_ll_find_common_free_block(&smd->old_ll, &smd->ll, smd->begin, smd->ll.nr_blocks, b); + if (r == -ENOSPC) { + /* + * There's no free block between smd->begin and the end of the metadata device. + * We search before smd->begin in case something has been freed. + */ + r = sm_ll_find_common_free_block(&smd->old_ll, &smd->ll, 0, smd->begin, b); + } + if (r) return r; @@ -199,7 +207,6 @@ static int sm_disk_commit(struct dm_space_map *sm) return r; memcpy(&smd->old_ll, &smd->ll, sizeof(smd->old_ll)); - smd->begin = 0; smd->nr_allocated_this_transaction = 0; r = sm_disk_get_nr_free(sm, &nr_free); diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index 31a999458be9..b3ded452e573 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -451,6 +451,14 @@ static int sm_metadata_new_block_(struct dm_space_map *sm, dm_block_t *b) * Any block we allocate has to be free in both the old and current ll. */ r = sm_ll_find_common_free_block(&smm->old_ll, &smm->ll, smm->begin, smm->ll.nr_blocks, b); + if (r == -ENOSPC) { + /* + * There's no free block between smm->begin and the end of the metadata device. + * We search before smm->begin in case something has been freed. + */ + r = sm_ll_find_common_free_block(&smm->old_ll, &smm->ll, 0, smm->begin, b); + } + if (r) return r; @@ -502,7 +510,6 @@ static int sm_metadata_commit(struct dm_space_map *sm) return r; memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll)); - smm->begin = 0; smm->allocated_this_transaction = 0; return 0; diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index cdafa5e0ea6d..e179c121c030 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -152,6 +152,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) if (conf->nr_strip_zones == 1) { conf->layout = RAID0_ORIG_LAYOUT; + } else if (mddev->layout == RAID0_ORIG_LAYOUT || + mddev->layout == RAID0_ALT_MULTIZONE_LAYOUT) { + conf->layout = mddev->layout; } else if (default_layout == RAID0_ORIG_LAYOUT || default_layout == RAID0_ALT_MULTIZONE_LAYOUT) { conf->layout = default_layout; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 0a9d623b13c2..daa478e0b856 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -446,6 +446,8 @@ static void raid1_end_write_request(struct bio *bio) if (!test_bit(Faulty, &rdev->flags)) set_bit(R1BIO_WriteError, &r1_bio->state); else { + /* Fail the request */ + set_bit(R1BIO_Degraded, &r1_bio->state); /* Finished with this branch */ r1_bio->bios[mirror] = NULL; to_put = bio; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index d08d77b9674f..419ecdd914f4 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1120,7 +1120,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, struct md_rdev *err_rdev = NULL; gfp_t gfp = GFP_NOIO; - if (r10_bio->devs[slot].rdev) { + if (slot >= 0 && r10_bio->devs[slot].rdev) { /* * This is an error retry, but we cannot * safely dereference the rdev in the r10_bio, @@ -1513,6 +1513,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) r10_bio->mddev = mddev; r10_bio->sector = bio->bi_iter.bi_sector; r10_bio->state = 0; + r10_bio->read_slot = -1; memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->copies); if (bio_data_dir(bio) == READ) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d5c14d56a714..ed55b02f9f89 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2415,8 +2415,6 @@ static int resize_stripes(struct r5conf *conf, int newsize) } else err = -ENOMEM; - mutex_unlock(&conf->cache_size_mutex); - conf->slab_cache = sc; conf->active_name = 1-conf->active_name; @@ -2439,6 +2437,8 @@ static int resize_stripes(struct r5conf *conf, int newsize) if (!err) conf->pool_size = newsize; + mutex_unlock(&conf->cache_size_mutex); + return err; } @@ -3593,6 +3593,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, * is missing/faulty, then we need to read everything we can. */ if (sh->raid_conf->level != 6 && + sh->raid_conf->rmw_level != PARITY_DISABLE_RMW && sh->sector < sh->raid_conf->mddev->recovery_cp) /* reconstruct-write isn't being forced */ return 0; @@ -4829,7 +4830,7 @@ static void handle_stripe(struct stripe_head *sh) * or to load a block that is being partially written. */ if (s.to_read || s.non_overwrite - || (conf->level == 6 && s.to_write && s.failed) + || (s.to_write && s.failed) || (s.syncing && (s.uptodate + s.compute < disks)) || s.replacing || s.expanding) |