diff options
Diffstat (limited to 'drivers/md')
44 files changed, 718 insertions, 324 deletions
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 6f776823b9ba..a1df0d95151c 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -377,7 +377,10 @@ retry_invalidate: if (!fifo_full(&ca->free_inc)) goto retry_invalidate; - bch_prio_write(ca); + if (bch_prio_write(ca, false) < 0) { + ca->invalidate_needs_gc = 1; + wake_up_gc(ca->set); + } } } out: diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index e30a983a68cd..2e396954b39d 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -264,7 +264,7 @@ struct bcache_device { #define BCACHE_DEV_UNLINK_DONE 2 #define BCACHE_DEV_WB_RUNNING 3 #define BCACHE_DEV_RATE_DW_RUNNING 4 - unsigned int nr_stripes; + int nr_stripes; unsigned int stripe_size; atomic_t *stripe_sectors_dirty; unsigned long *full_dirty_stripes; @@ -329,6 +329,9 @@ struct cached_dev { */ atomic_t has_dirty; +#define BCH_CACHE_READA_ALL 0 +#define BCH_CACHE_READA_META_ONLY 1 + unsigned int cache_readahead_policy; struct bch_ratelimit writeback_rate; struct delayed_work writeback_rate_update; @@ -977,7 +980,7 @@ bool bch_cached_dev_error(struct cached_dev *dc); __printf(2, 3) bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...); -void bch_prio_write(struct cache *ca); +int bch_prio_write(struct cache *ca, bool wait); void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent); extern struct workqueue_struct *bcache_wq; diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 268f1b685084..ec48cf86cab6 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -321,7 +321,7 @@ int bch_btree_keys_alloc(struct btree_keys *b, b->page_order = page_order; - t->data = (void *) __get_free_pages(gfp, b->page_order); + t->data = (void *) __get_free_pages(__GFP_COMP|gfp, b->page_order); if (!t->data) goto err; diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index c71365e7c1fa..a50dcfda656f 100644 --- a/drivers/md/bcache/bset.h +++ b/drivers/md/bcache/bset.h @@ -397,7 +397,8 @@ void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *state); /* Bkey utility code */ -#define bset_bkey_last(i) bkey_idx((struct bkey *) (i)->d, (i)->keys) +#define bset_bkey_last(i) bkey_idx((struct bkey *) (i)->d, \ + (unsigned int)(i)->keys) static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned int idx) { diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 5cf3247e8afb..b60cb7a99b18 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -719,6 +719,8 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, * IO can always make forward progress: */ nr /= c->btree_pages; + if (nr == 0) + nr = 1; nr = min_t(unsigned long, nr, mca_can_free(c)); i = 0; @@ -834,7 +836,7 @@ int bch_btree_cache_alloc(struct cache_set *c) mutex_init(&c->verify_lock); c->verify_ondisk = (void *) - __get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c))); + __get_free_pages(GFP_KERNEL|__GFP_COMP, ilog2(bucket_pages(c))); c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); @@ -1436,7 +1438,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, if (__set_blocks(n1, n1->keys + n2->keys, block_bytes(b->c)) > btree_blocks(new_nodes[i])) - goto out_nocoalesce; + goto out_unlock_nocoalesce; keys = n2->keys; /* Take the key of the node we're getting rid of */ @@ -1465,7 +1467,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, if (__bch_keylist_realloc(&keylist, bkey_u64s(&new_nodes[i]->key))) - goto out_nocoalesce; + goto out_unlock_nocoalesce; bch_btree_node_write(new_nodes[i], &cl); bch_keylist_add(&keylist, &new_nodes[i]->key); @@ -1511,6 +1513,10 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op, /* Invalidated our iterator */ return -EINTR; +out_unlock_nocoalesce: + for (i = 0; i < nodes; i++) + mutex_unlock(&new_nodes[i]->write_lock); + out_nocoalesce: closure_sync(&cl); diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c index 33556acdcf9c..e145c2b1e2c9 100644 --- a/drivers/md/bcache/journal.c +++ b/drivers/md/bcache/journal.c @@ -865,8 +865,8 @@ int bch_journal_alloc(struct cache_set *c) j->w[1].c = c; if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || - !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) || - !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS))) + !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP, JSET_BITS)) || + !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP, JSET_BITS))) return -ENOMEM; return 0; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 41adcd1546f1..4045ae748f17 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -391,13 +391,20 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) goto skip; /* - * Flag for bypass if the IO is for read-ahead or background, - * unless the read-ahead request is for metadata + * If the bio is for read-ahead or background IO, bypass it or + * not depends on the following situations, + * - If the IO is for meta data, always cache it and no bypass + * - If the IO is not meta data, check dc->cache_reada_policy, + * BCH_CACHE_READA_ALL: cache it and not bypass + * BCH_CACHE_READA_META_ONLY: not cache it and bypass + * That is, read-ahead request for metadata always get cached * (eg, for gfs2 or xfs). */ - if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) && - !(bio->bi_opf & (REQ_META|REQ_PRIO))) - goto skip; + if ((bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND))) { + if (!(bio->bi_opf & (REQ_META|REQ_PRIO)) && + (dc->cache_readahead_policy != BCH_CACHE_READA_ALL)) + goto skip; + } if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) || bio_sectors(bio) & (c->sb.block_size - 1)) { diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c index ba1c93791d8d..503aafe188dc 100644 --- a/drivers/md/bcache/stats.c +++ b/drivers/md/bcache/stats.c @@ -109,9 +109,13 @@ int bch_cache_accounting_add_kobjs(struct cache_accounting *acc, void bch_cache_accounting_clear(struct cache_accounting *acc) { - memset(&acc->total.cache_hits, - 0, - sizeof(struct cache_stats)); + acc->total.cache_hits = 0; + acc->total.cache_misses = 0; + acc->total.cache_bypass_hits = 0; + acc->total.cache_bypass_misses = 0; + acc->total.cache_readaheads = 0; + acc->total.cache_miss_collisions = 0; + acc->total.sectors_bypassed = 0; } void bch_cache_accounting_destroy(struct cache_accounting *acc) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 6daf777105fb..5248bca5e2ed 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -527,12 +527,29 @@ static void prio_io(struct cache *ca, uint64_t bucket, int op, closure_sync(cl); } -void bch_prio_write(struct cache *ca) +int bch_prio_write(struct cache *ca, bool wait) { int i; struct bucket *b; struct closure cl; + pr_debug("free_prio=%zu, free_none=%zu, free_inc=%zu", + fifo_used(&ca->free[RESERVE_PRIO]), + fifo_used(&ca->free[RESERVE_NONE]), + fifo_used(&ca->free_inc)); + + /* + * Pre-check if there are enough free buckets. In the non-blocking + * scenario it's better to fail early rather than starting to allocate + * buckets and do a cleanup later in case of failure. + */ + if (!wait) { + size_t avail = fifo_used(&ca->free[RESERVE_PRIO]) + + fifo_used(&ca->free[RESERVE_NONE]); + if (prio_buckets(ca) > avail) + return -ENOMEM; + } + closure_init_stack(&cl); lockdep_assert_held(&ca->set->bucket_lock); @@ -542,9 +559,6 @@ void bch_prio_write(struct cache *ca) atomic_long_add(ca->sb.bucket_size * prio_buckets(ca), &ca->meta_sectors_written); - //pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free), - // fifo_used(&ca->free_inc), fifo_used(&ca->unused)); - for (i = prio_buckets(ca) - 1; i >= 0; --i) { long bucket; struct prio_set *p = ca->disk_buckets; @@ -562,7 +576,7 @@ void bch_prio_write(struct cache *ca) p->magic = pset_magic(&ca->sb); p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8); - bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true); + bucket = bch_bucket_alloc(ca, RESERVE_PRIO, wait); BUG_ON(bucket == -1); mutex_unlock(&ca->set->bucket_lock); @@ -591,6 +605,7 @@ void bch_prio_write(struct cache *ca) ca->prio_last_buckets[i] = ca->prio_buckets[i]; } + return 0; } static void prio_read(struct cache *ca, uint64_t bucket) @@ -754,20 +769,31 @@ static inline int idx_to_first_minor(int idx) static void bcache_device_free(struct bcache_device *d) { + struct gendisk *disk = d->disk; + lockdep_assert_held(&bch_register_lock); - pr_info("%s stopped", d->disk->disk_name); + if (disk) + pr_info("%s stopped", disk->disk_name); + else + pr_err("bcache device (NULL gendisk) stopped"); if (d->c) bcache_device_detach(d); - if (d->disk && d->disk->flags & GENHD_FL_UP) - del_gendisk(d->disk); - if (d->disk && d->disk->queue) - blk_cleanup_queue(d->disk->queue); - if (d->disk) { + + if (disk) { + bool disk_added = (disk->flags & GENHD_FL_UP) != 0; + + if (disk_added) + del_gendisk(disk); + + if (disk->queue) + blk_cleanup_queue(disk->queue); + ida_simple_remove(&bcache_device_idx, - first_minor_to_idx(d->disk->first_minor)); - put_disk(d->disk); + first_minor_to_idx(disk->first_minor)); + if (disk_added) + put_disk(disk); } bioset_exit(&d->bio_split); @@ -1209,6 +1235,9 @@ static void cached_dev_free(struct closure *cl) mutex_unlock(&bch_register_lock); + if (dc->sb_bio.bi_inline_vecs[0].bv_page) + put_page(bio_first_page_all(&dc->sb_bio)); + if (!IS_ERR_OR_NULL(dc->bdev)) blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); @@ -1674,7 +1703,7 @@ void bch_cache_set_unregister(struct cache_set *c) } #define alloc_bucket_pages(gfp, c) \ - ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(c)))) + ((void *) __get_free_pages(__GFP_ZERO|__GFP_COMP|gfp, ilog2(bucket_pages(c)))) struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) { @@ -1886,7 +1915,7 @@ static int run_cache_set(struct cache_set *c) mutex_lock(&c->bucket_lock); for_each_cache(ca, c, i) - bch_prio_write(ca); + bch_prio_write(ca, true); mutex_unlock(&c->bucket_lock); err = "cannot allocate new UUID bucket"; @@ -1994,7 +2023,14 @@ found: sysfs_create_link(&c->kobj, &ca->kobj, buf)) goto err; - if (ca->sb.seq > c->sb.seq) { + /* + * A special case is both ca->sb.seq and c->sb.seq are 0, + * such condition happens on a new created cache device whose + * super block is never flushed yet. In this case c->sb.version + * and other members should be updated too, otherwise we will + * have a mistaken super block version in cache set. + */ + if (ca->sb.seq > c->sb.seq || c->sb.seq == 0) { c->sb.version = ca->sb.version; memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16); c->sb.flags = ca->sb.flags; @@ -2274,29 +2310,35 @@ static bool bch_is_open(struct block_device *bdev) static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, const char *buffer, size_t size) { - ssize_t ret = -EINVAL; - const char *err = "cannot allocate memory"; + const char *err; char *path = NULL; - struct cache_sb *sb = NULL; + struct cache_sb *sb; struct block_device *bdev = NULL; - struct page *sb_page = NULL; + struct page *sb_page; + ssize_t ret; + ret = -EBUSY; + err = "failed to reference bcache module"; if (!try_module_get(THIS_MODULE)) - return -EBUSY; + goto out; /* For latest state of bcache_is_reboot */ smp_mb(); + err = "bcache is in reboot"; if (bcache_is_reboot) - return -EBUSY; + goto out_module_put; + ret = -ENOMEM; + err = "cannot allocate memory"; path = kstrndup(buffer, size, GFP_KERNEL); if (!path) - goto err; + goto out_module_put; sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL); if (!sb) - goto err; + goto out_free_path; + ret = -EINVAL; err = "failed to open device"; bdev = blkdev_get_by_path(strim(path), FMODE_READ|FMODE_WRITE|FMODE_EXCL, @@ -2313,57 +2355,69 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, if (!IS_ERR(bdev)) bdput(bdev); if (attr == &ksysfs_register_quiet) - goto quiet_out; + goto done; } - goto err; + goto out_free_sb; } err = "failed to set blocksize"; if (set_blocksize(bdev, 4096)) - goto err_close; + goto out_blkdev_put; err = read_super(sb, bdev, &sb_page); if (err) - goto err_close; + goto out_blkdev_put; err = "failed to register device"; if (SB_IS_BDEV(sb)) { struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL); if (!dc) - goto err_close; + goto out_put_sb_page; mutex_lock(&bch_register_lock); ret = register_bdev(sb, sb_page, bdev, dc); mutex_unlock(&bch_register_lock); /* blkdev_put() will be called in cached_dev_free() */ - if (ret < 0) - goto err; + if (ret < 0) { + bdev = NULL; + goto out_put_sb_page; + } } else { struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL); if (!ca) - goto err_close; + goto out_put_sb_page; /* blkdev_put() will be called in bch_cache_release() */ - if (register_cache(sb, sb_page, bdev, ca) != 0) - goto err; + if (register_cache(sb, sb_page, bdev, ca) != 0) { + bdev = NULL; + goto out_put_sb_page; + } } -quiet_out: - ret = size; -out: - if (sb_page) - put_page(sb_page); + + put_page(sb_page); +done: + kfree(sb); + kfree(path); + module_put(THIS_MODULE); + return size; + +out_put_sb_page: + put_page(sb_page); +out_blkdev_put: + if (bdev) + blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); +out_free_sb: kfree(sb); +out_free_path: kfree(path); + path = NULL; +out_module_put: module_put(THIS_MODULE); +out: + pr_info("error %s: %s", path?path:"", err); return ret; - -err_close: - blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); -err: - pr_info("error %s: %s", path, err); - goto out; } static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 54cd1727d20c..9d5cc9fc8d65 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -27,6 +27,12 @@ static const char * const bch_cache_modes[] = { NULL }; +static const char * const bch_reada_cache_policies[] = { + "all", + "meta-only", + NULL +}; + /* Default is 0 ("auto") */ static const char * const bch_stop_on_failure_modes[] = { "auto", @@ -100,6 +106,7 @@ rw_attribute(congested_write_threshold_us); rw_attribute(sequential_cutoff); rw_attribute(data_csum); rw_attribute(cache_mode); +rw_attribute(readahead_cache_policy); rw_attribute(stop_when_cache_set_failed); rw_attribute(writeback_metadata); rw_attribute(writeback_running); @@ -167,6 +174,11 @@ SHOW(__bch_cached_dev) bch_cache_modes, BDEV_CACHE_MODE(&dc->sb)); + if (attr == &sysfs_readahead_cache_policy) + return bch_snprint_string_list(buf, PAGE_SIZE, + bch_reada_cache_policies, + dc->cache_readahead_policy); + if (attr == &sysfs_stop_when_cache_set_failed) return bch_snprint_string_list(buf, PAGE_SIZE, bch_stop_on_failure_modes, @@ -349,6 +361,15 @@ STORE(__cached_dev) } } + if (attr == &sysfs_readahead_cache_policy) { + v = __sysfs_match_string(bch_reada_cache_policies, -1, buf); + if (v < 0) + return v; + + if ((unsigned int) v != dc->cache_readahead_policy) + dc->cache_readahead_policy = v; + } + if (attr == &sysfs_stop_when_cache_set_failed) { v = __sysfs_match_string(bch_stop_on_failure_modes, -1, buf); if (v < 0) @@ -463,6 +484,7 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_data_csum, #endif &sysfs_cache_mode, + &sysfs_readahead_cache_policy, &sysfs_stop_when_cache_set_failed, &sysfs_writeback_metadata, &sysfs_writeback_running, diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index e9ffcea1ca50..58bedd51e11b 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -516,15 +516,19 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned int inode, uint64_t offset, int nr_sectors) { struct bcache_device *d = c->devices[inode]; - unsigned int stripe_offset, stripe, sectors_dirty; + unsigned int stripe_offset, sectors_dirty; + int stripe; if (!d) return; + stripe = offset_to_stripe(d, offset); + if (stripe < 0) + return; + if (UUID_FLASH_ONLY(&c->uuids[inode])) atomic_long_add(nr_sectors, &c->flash_dev_dirty_sectors); - stripe = offset_to_stripe(d, offset); stripe_offset = offset & (d->stripe_size - 1); while (nr_sectors) { @@ -564,12 +568,12 @@ static bool dirty_pred(struct keybuf *buf, struct bkey *k) static void refill_full_stripes(struct cached_dev *dc) { struct keybuf *buf = &dc->writeback_keys; - unsigned int start_stripe, stripe, next_stripe; + unsigned int start_stripe, next_stripe; + int stripe; bool wrapped = false; stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned)); - - if (stripe >= dc->disk.nr_stripes) + if (stripe < 0) stripe = 0; start_stripe = stripe; diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 4e4c6810dc3c..c4ff76037227 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -33,10 +33,22 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) return ret; } -static inline unsigned int offset_to_stripe(struct bcache_device *d, +static inline int offset_to_stripe(struct bcache_device *d, uint64_t offset) { do_div(offset, d->stripe_size); + + /* d->nr_stripes is in range [1, INT_MAX] */ + if (unlikely(offset >= d->nr_stripes)) { + pr_err("Invalid stripe %llu (>= nr_stripes %d).\n", + offset, d->nr_stripes); + return -EINVAL; + } + + /* + * Here offset is definitly smaller than INT_MAX, + * return it as int will never overflow. + */ return offset; } @@ -44,7 +56,10 @@ static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc, uint64_t offset, unsigned int nr_sectors) { - unsigned int stripe = offset_to_stripe(&dc->disk, offset); + int stripe = offset_to_stripe(&dc->disk, offset); + + if (stripe < 0) + return false; while (1) { if (atomic_read(dc->disk.stripe_sectors_dirty + stripe)) diff --git a/drivers/md/dm-bio-record.h b/drivers/md/dm-bio-record.h index c82578af56a5..2ea0360108e1 100644 --- a/drivers/md/dm-bio-record.h +++ b/drivers/md/dm-bio-record.h @@ -20,8 +20,13 @@ struct dm_bio_details { struct gendisk *bi_disk; u8 bi_partno; + int __bi_remaining; unsigned long bi_flags; struct bvec_iter bi_iter; + bio_end_io_t *bi_end_io; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + struct bio_integrity_payload *bi_integrity; +#endif }; static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio) @@ -30,6 +35,11 @@ static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio) bd->bi_partno = bio->bi_partno; bd->bi_flags = bio->bi_flags; bd->bi_iter = bio->bi_iter; + bd->__bi_remaining = atomic_read(&bio->__bi_remaining); + bd->bi_end_io = bio->bi_end_io; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + bd->bi_integrity = bio_integrity(bio); +#endif } static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio) @@ -38,6 +48,11 @@ static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio) bio->bi_partno = bd->bi_partno; bio->bi_flags = bd->bi_flags; bio->bi_iter = bd->bi_iter; + atomic_set(&bio->__bi_remaining, bd->__bi_remaining); + bio->bi_end_io = bd->bi_end_io; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + bio->bi_integrity = bd->bi_integrity; +#endif } #endif diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 8346e6d1816c..f595e9867cbe 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2867,8 +2867,8 @@ static void cache_postsuspend(struct dm_target *ti) prevent_background_work(cache); BUG_ON(atomic_read(&cache->nr_io_migrations)); - cancel_delayed_work(&cache->waker); - flush_workqueue(cache->wq); + cancel_delayed_work_sync(&cache->waker); + drain_workqueue(cache->wq); WARN_ON(cache->tracker.in_flight); /* diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index e78e4d3d3352..47f6065a31e8 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -481,8 +481,14 @@ static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti, const char *opts) { - unsigned bs = crypto_skcipher_blocksize(any_tfm(cc)); - int log = ilog2(bs); + unsigned bs; + int log; + + if (test_bit(CRYPT_MODE_INTEGRITY_AEAD, &cc->cipher_flags)) + bs = crypto_aead_blocksize(any_tfm_aead(cc)); + else + bs = crypto_skcipher_blocksize(any_tfm(cc)); + log = ilog2(bs); /* we need to calculate how far we must shift the sector count * to get the cipher block count, we use this shift in _gen */ @@ -3083,7 +3089,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits) limits->max_segment_size = PAGE_SIZE; limits->logical_block_size = - max_t(unsigned short, limits->logical_block_size, cc->sector_size); + max_t(unsigned, limits->logical_block_size, cc->sector_size); limits->physical_block_size = max_t(unsigned, limits->physical_block_size, cc->sector_size); limits->io_min = max_t(unsigned, limits->io_min, cc->sector_size); diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 29a5e5b4c63c..382b8694be50 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -6,6 +6,8 @@ * This file is released under the GPL. */ +#include "dm-bio-record.h" + #include <linux/compiler.h> #include <linux/module.h> #include <linux/device-mapper.h> @@ -199,17 +201,19 @@ struct dm_integrity_c { __u8 log2_blocks_per_bitmap_bit; unsigned char mode; - int suspending; int failed; struct crypto_shash *internal_hash; + struct dm_target *ti; + /* these variables are locked with endio_wait.lock */ struct rb_root in_progress; struct list_head wait_list; wait_queue_head_t endio_wait; struct workqueue_struct *wait_wq; + struct workqueue_struct *offload_wq; unsigned char commit_seq; commit_id_t commit_ids[N_COMMIT_IDS]; @@ -290,11 +294,7 @@ struct dm_integrity_io { struct completion *completion; - struct gendisk *orig_bi_disk; - u8 orig_bi_partno; - bio_end_io_t *orig_bi_end_io; - struct bio_integrity_payload *orig_bi_integrity; - struct bvec_iter orig_bi_iter; + struct dm_bio_details bio_details; }; struct journal_completion { @@ -1421,7 +1421,7 @@ static void dec_in_flight(struct dm_integrity_io *dio) dio->range.logical_sector += dio->range.n_sectors; bio_advance(bio, dio->range.n_sectors << SECTOR_SHIFT); INIT_WORK(&dio->work, integrity_bio_wait); - queue_work(ic->wait_wq, &dio->work); + queue_work(ic->offload_wq, &dio->work); return; } do_endio_flush(ic, dio); @@ -1432,14 +1432,9 @@ static void integrity_end_io(struct bio *bio) { struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io)); - bio->bi_iter = dio->orig_bi_iter; - bio->bi_disk = dio->orig_bi_disk; - bio->bi_partno = dio->orig_bi_partno; - if (dio->orig_bi_integrity) { - bio->bi_integrity = dio->orig_bi_integrity; + dm_bio_restore(&dio->bio_details, bio); + if (bio->bi_integrity) bio->bi_opf |= REQ_INTEGRITY; - } - bio->bi_end_io = dio->orig_bi_end_io; if (dio->completion) complete(dio->completion); @@ -1506,7 +1501,7 @@ static void integrity_metadata(struct work_struct *w) struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io)); char *checksums; unsigned extra_space = unlikely(digest_size > ic->tag_size) ? digest_size - ic->tag_size : 0; - char checksums_onstack[HASH_MAX_DIGESTSIZE]; + char checksums_onstack[max((size_t)HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; unsigned sectors_to_process = dio->range.n_sectors; sector_t sector = dio->range.logical_sector; @@ -1524,7 +1519,7 @@ static void integrity_metadata(struct work_struct *w) } } - __bio_for_each_segment(bv, bio, iter, dio->orig_bi_iter) { + __bio_for_each_segment(bv, bio, iter, dio->bio_details.bi_iter) { unsigned pos; char *mem, *checksums_ptr; @@ -1568,7 +1563,7 @@ again: if (likely(checksums != checksums_onstack)) kfree(checksums); } else { - struct bio_integrity_payload *bip = dio->orig_bi_integrity; + struct bio_integrity_payload *bip = dio->bio_details.bi_integrity; if (bip) { struct bio_vec biv; @@ -1735,7 +1730,7 @@ retry_kmap: } while (++s < ic->sectors_per_block); #ifdef INTERNAL_VERIFY if (ic->internal_hash) { - char checksums_onstack[max(HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; + char checksums_onstack[max((size_t)HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack); if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) { @@ -1847,7 +1842,7 @@ static void dm_integrity_map_continue(struct dm_integrity_io *dio, bool from_map if (need_sync_io && from_map) { INIT_WORK(&dio->work, integrity_bio_wait); - queue_work(ic->metadata_wq, &dio->work); + queue_work(ic->offload_wq, &dio->work); return; } @@ -1987,20 +1982,13 @@ offload_to_thread: } else dio->completion = NULL; - dio->orig_bi_iter = bio->bi_iter; - - dio->orig_bi_disk = bio->bi_disk; - dio->orig_bi_partno = bio->bi_partno; + dm_bio_record(&dio->bio_details, bio); bio_set_dev(bio, ic->dev->bdev); - - dio->orig_bi_integrity = bio_integrity(bio); bio->bi_integrity = NULL; bio->bi_opf &= ~REQ_INTEGRITY; - - dio->orig_bi_end_io = bio->bi_end_io; bio->bi_end_io = integrity_end_io; - bio->bi_iter.bi_size = dio->range.n_sectors << SECTOR_SHIFT; + generic_make_request(bio); if (need_sync_io) { @@ -2297,7 +2285,7 @@ static void integrity_writer(struct work_struct *w) unsigned prev_free_sectors; /* the following test is not needed, but it tests the replay code */ - if (READ_ONCE(ic->suspending) && !ic->meta_dev) + if (unlikely(dm_post_suspending(ic->ti)) && !ic->meta_dev) return; spin_lock_irq(&ic->endio_wait.lock); @@ -2359,7 +2347,7 @@ static void integrity_recalc(struct work_struct *w) next_chunk: - if (unlikely(READ_ONCE(ic->suspending))) + if (unlikely(dm_post_suspending(ic->ti))) goto unlock_ret; range.logical_sector = le64_to_cpu(ic->sb->recalc_sector); @@ -2484,7 +2472,7 @@ static void bitmap_block_work(struct work_struct *w) dio->range.n_sectors, BITMAP_OP_TEST_ALL_SET)) { remove_range(ic, &dio->range); INIT_WORK(&dio->work, integrity_bio_wait); - queue_work(ic->wait_wq, &dio->work); + queue_work(ic->offload_wq, &dio->work); } else { block_bitmap_op(ic, ic->journal, dio->range.logical_sector, dio->range.n_sectors, BITMAP_OP_SET); @@ -2507,7 +2495,7 @@ static void bitmap_block_work(struct work_struct *w) remove_range(ic, &dio->range); INIT_WORK(&dio->work, integrity_bio_wait); - queue_work(ic->wait_wq, &dio->work); + queue_work(ic->offload_wq, &dio->work); } queue_delayed_work(ic->commit_wq, &ic->bitmap_flush_work, ic->bitmap_flush_interval); @@ -2787,8 +2775,6 @@ static void dm_integrity_postsuspend(struct dm_target *ti) del_timer_sync(&ic->autocommit_timer); - WRITE_ONCE(ic->suspending, 1); - if (ic->recalc_wq) drain_workqueue(ic->recalc_wq); @@ -2817,8 +2803,6 @@ static void dm_integrity_postsuspend(struct dm_target *ti) #endif } - WRITE_ONCE(ic->suspending, 0); - BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress)); ic->journal_uptodate = true; @@ -2871,17 +2855,24 @@ static void dm_integrity_resume(struct dm_target *ti) } else { replay_journal(ic); if (ic->mode == 'B') { - int mode; ic->sb->flags |= cpu_to_le32(SB_FLAG_DIRTY_BITMAP); ic->sb->log2_blocks_per_bitmap_bit = ic->log2_blocks_per_bitmap_bit; r = sync_rw_sb(ic, REQ_OP_WRITE, REQ_FUA); if (unlikely(r)) dm_integrity_io_error(ic, "writing superblock", r); - mode = ic->recalculate_flag ? BITMAP_OP_SET : BITMAP_OP_CLEAR; - block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, mode); - block_bitmap_op(ic, ic->recalc_bitmap, 0, ic->provided_data_sectors, mode); - block_bitmap_op(ic, ic->may_write_bitmap, 0, ic->provided_data_sectors, mode); + block_bitmap_op(ic, ic->journal, 0, ic->provided_data_sectors, BITMAP_OP_CLEAR); + block_bitmap_op(ic, ic->recalc_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_CLEAR); + block_bitmap_op(ic, ic->may_write_bitmap, 0, ic->provided_data_sectors, BITMAP_OP_CLEAR); + if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) && + le64_to_cpu(ic->sb->recalc_sector) < ic->provided_data_sectors) { + block_bitmap_op(ic, ic->journal, le64_to_cpu(ic->sb->recalc_sector), + ic->provided_data_sectors - le64_to_cpu(ic->sb->recalc_sector), BITMAP_OP_SET); + block_bitmap_op(ic, ic->recalc_bitmap, le64_to_cpu(ic->sb->recalc_sector), + ic->provided_data_sectors - le64_to_cpu(ic->sb->recalc_sector), BITMAP_OP_SET); + block_bitmap_op(ic, ic->may_write_bitmap, le64_to_cpu(ic->sb->recalc_sector), + ic->provided_data_sectors - le64_to_cpu(ic->sb->recalc_sector), BITMAP_OP_SET); + } rw_journal_sectors(ic, REQ_OP_WRITE, REQ_FUA | REQ_SYNC, 0, ic->n_bitmap_blocks * (BITMAP_BLOCK_SIZE >> SECTOR_SHIFT), NULL); } @@ -2949,7 +2940,7 @@ static void dm_integrity_status(struct dm_target *ti, status_type_t type, DMEMIT(" meta_device:%s", ic->meta_dev->name); if (ic->sectors_per_block != 1) DMEMIT(" block_size:%u", ic->sectors_per_block << SECTOR_SHIFT); - if (ic->recalculate_flag) + if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) DMEMIT(" recalculate"); DMEMIT(" journal_sectors:%u", ic->initial_sectors - SB_SECTORS); DMEMIT(" interleave_sectors:%u", 1U << ic->sb->log2_interleave_sectors); @@ -3596,6 +3587,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) } ti->private = ic; ti->per_io_data_size = sizeof(struct dm_integrity_io); + ic->ti = ti; ic->in_progress = RB_ROOT; INIT_LIST_HEAD(&ic->wait_list); @@ -3807,6 +3799,14 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) goto bad; } + ic->offload_wq = alloc_workqueue("dm-integrity-offload", WQ_MEM_RECLAIM, + METADATA_WORKQUEUE_MAX_ACTIVE); + if (!ic->offload_wq) { + ti->error = "Cannot allocate workqueue"; + r = -ENOMEM; + goto bad; + } + ic->commit_wq = alloc_workqueue("dm-integrity-commit", WQ_MEM_RECLAIM, 1); if (!ic->commit_wq) { ti->error = "Cannot allocate workqueue"; @@ -4111,6 +4111,8 @@ static void dm_integrity_dtr(struct dm_target *ti) destroy_workqueue(ic->metadata_wq); if (ic->wait_wq) destroy_workqueue(ic->wait_wq); + if (ic->offload_wq) + destroy_workqueue(ic->offload_wq); if (ic->commit_wq) destroy_workqueue(ic->commit_wq); if (ic->writer_wq) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index dbcc1e41cd57..f2de4c73cc8f 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -576,10 +576,12 @@ static struct pgpath *__map_bio(struct multipath *m, struct bio *bio) /* Do we need to select a new pgpath? */ pgpath = READ_ONCE(m->current_pgpath); - queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags); - if (!pgpath || !queue_io) + if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) pgpath = choose_pgpath(m, bio->bi_iter.bi_size); + /* MPATHF_QUEUE_IO might have been cleared by choose_pgpath. */ + queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags); + if ((pgpath && queue_io) || (!pgpath && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) { /* Queue for the daemon to resubmit */ @@ -599,45 +601,10 @@ static struct pgpath *__map_bio(struct multipath *m, struct bio *bio) return pgpath; } -static struct pgpath *__map_bio_fast(struct multipath *m, struct bio *bio) -{ - struct pgpath *pgpath; - unsigned long flags; - - /* Do we need to select a new pgpath? */ - /* - * FIXME: currently only switching path if no path (due to failure, etc) - * - which negates the point of using a path selector - */ - pgpath = READ_ONCE(m->current_pgpath); - if (!pgpath) - pgpath = choose_pgpath(m, bio->bi_iter.bi_size); - - if (!pgpath) { - if (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { - /* Queue for the daemon to resubmit */ - spin_lock_irqsave(&m->lock, flags); - bio_list_add(&m->queued_bios, bio); - spin_unlock_irqrestore(&m->lock, flags); - queue_work(kmultipathd, &m->process_queued_bios); - - return ERR_PTR(-EAGAIN); - } - return NULL; - } - - return pgpath; -} - static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_mpath_io *mpio) { - struct pgpath *pgpath; - - if (!m->hw_handler_name) - pgpath = __map_bio_fast(m, bio); - else - pgpath = __map_bio(m, bio); + struct pgpath *pgpath = __map_bio(m, bio); if (IS_ERR(pgpath)) return DM_MAPIO_SUBMITTED; @@ -1889,7 +1856,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti, int r; current_pgpath = READ_ONCE(m->current_pgpath); - if (!current_pgpath) + if (!current_pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags)) current_pgpath = choose_pgpath(m, 0); if (current_pgpath) { diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index b41ecb451c78..99784f16cb56 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -70,9 +70,6 @@ void dm_start_queue(struct request_queue *q) void dm_stop_queue(struct request_queue *q) { - if (blk_mq_queue_stopped(q)) - return; - blk_mq_quiesce_queue(q); } diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 3c50c4e4da8f..963d3774c93e 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -17,7 +17,7 @@ #include <linux/dm-bufio.h> #define DM_MSG_PREFIX "persistent snapshot" -#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32 /* 16KB */ +#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32U /* 16KB */ #define DM_PREFETCH_CHUNKS 12 diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 4c68a7b93d5e..4cd8868f8004 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -189,6 +189,15 @@ struct dm_pool_metadata { sector_t data_block_size; /* + * Pre-commit callback. + * + * This allows the thin provisioning target to run a callback before + * the metadata are committed. + */ + dm_pool_pre_commit_fn pre_commit_fn; + void *pre_commit_context; + + /* * We reserve a section of the metadata for commit overhead. * All reported space does *not* include this. */ @@ -378,16 +387,15 @@ static int subtree_equal(void *context, const void *value1_le, const void *value * Variant that is used for in-core only changes or code that * shouldn't put the pool in service on its own (e.g. commit). */ -static inline void __pmd_write_lock(struct dm_pool_metadata *pmd) +static inline void pmd_write_lock_in_core(struct dm_pool_metadata *pmd) __acquires(pmd->root_lock) { down_write(&pmd->root_lock); } -#define pmd_write_lock_in_core(pmd) __pmd_write_lock((pmd)) static inline void pmd_write_lock(struct dm_pool_metadata *pmd) { - __pmd_write_lock(pmd); + pmd_write_lock_in_core(pmd); if (unlikely(!pmd->in_service)) pmd->in_service = true; } @@ -822,10 +830,19 @@ static int __commit_transaction(struct dm_pool_metadata *pmd) * We need to know if the thin_disk_superblock exceeds a 512-byte sector. */ BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512); + BUG_ON(!rwsem_is_locked(&pmd->root_lock)); if (unlikely(!pmd->in_service)) return 0; + if (pmd->pre_commit_fn) { + r = pmd->pre_commit_fn(pmd->pre_commit_context); + if (r < 0) { + DMERR("pre-commit callback failed"); + return r; + } + } + r = __write_changed_details(pmd); if (r < 0) return r; @@ -892,6 +909,8 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, pmd->in_service = false; pmd->bdev = bdev; pmd->data_block_size = data_block_size; + pmd->pre_commit_fn = NULL; + pmd->pre_commit_context = NULL; r = __create_persistent_data_objects(pmd, format_device); if (r) { @@ -934,12 +953,14 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd) return -EBUSY; } + pmd_write_lock_in_core(pmd); if (!dm_bm_is_read_only(pmd->bm) && !pmd->fail_io) { r = __commit_transaction(pmd); if (r < 0) DMWARN("%s: __commit_transaction() failed, error = %d", __func__, r); } + pmd_write_unlock(pmd); if (!pmd->fail_io) __destroy_persistent_data_objects(pmd); @@ -1822,7 +1843,7 @@ int dm_pool_commit_metadata(struct dm_pool_metadata *pmd) * Care is taken to not have commit be what * triggers putting the thin-pool in-service. */ - __pmd_write_lock(pmd); + pmd_write_lock_in_core(pmd); if (pmd->fail_io) goto out; @@ -2044,6 +2065,16 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd, return r; } +void dm_pool_register_pre_commit_callback(struct dm_pool_metadata *pmd, + dm_pool_pre_commit_fn fn, + void *context) +{ + pmd_write_lock_in_core(pmd); + pmd->pre_commit_fn = fn; + pmd->pre_commit_context = context; + pmd_write_unlock(pmd); +} + int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd) { int r = -EINVAL; diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h index f6be0d733c20..7ef56bd2a7e3 100644 --- a/drivers/md/dm-thin-metadata.h +++ b/drivers/md/dm-thin-metadata.h @@ -230,6 +230,13 @@ bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd); */ void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd); +/* Pre-commit callback */ +typedef int (*dm_pool_pre_commit_fn)(void *context); + +void dm_pool_register_pre_commit_callback(struct dm_pool_metadata *pmd, + dm_pool_pre_commit_fn fn, + void *context); + /*----------------------------------------------------------------*/ #endif diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index fcd887703f95..1b2c98b43519 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -231,6 +231,7 @@ struct pool { struct dm_target *ti; /* Only set if a pool target is bound */ struct mapped_device *pool_md; + struct block_device *data_dev; struct block_device *md_dev; struct dm_pool_metadata *pmd; @@ -328,6 +329,7 @@ struct pool_c { dm_block_t low_water_blocks; struct pool_features requested_pf; /* Features requested during table load */ struct pool_features adjusted_pf; /* Features used after adjusting for constituent devices */ + struct bio flush_bio; }; /* @@ -2392,8 +2394,16 @@ static void process_deferred_bios(struct pool *pool) while ((bio = bio_list_pop(&bio_completions))) bio_endio(bio); - while ((bio = bio_list_pop(&bios))) - generic_make_request(bio); + while ((bio = bio_list_pop(&bios))) { + /* + * The data device was flushed as part of metadata commit, + * so complete redundant flushes immediately. + */ + if (bio->bi_opf & REQ_PREFLUSH) + bio_endio(bio); + else + generic_make_request(bio); + } } static void do_worker(struct work_struct *ws) @@ -2936,6 +2946,7 @@ static struct kmem_cache *_new_mapping_cache; static struct pool *pool_create(struct mapped_device *pool_md, struct block_device *metadata_dev, + struct block_device *data_dev, unsigned long block_size, int read_only, char **error) { @@ -3043,6 +3054,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, pool->last_commit_jiffies = jiffies; pool->pool_md = pool_md; pool->md_dev = metadata_dev; + pool->data_dev = data_dev; __pool_table_insert(pool); return pool; @@ -3084,6 +3096,7 @@ static void __pool_dec(struct pool *pool) static struct pool *__pool_find(struct mapped_device *pool_md, struct block_device *metadata_dev, + struct block_device *data_dev, unsigned long block_size, int read_only, char **error, int *created) { @@ -3094,19 +3107,23 @@ static struct pool *__pool_find(struct mapped_device *pool_md, *error = "metadata device already in use by a pool"; return ERR_PTR(-EBUSY); } + if (pool->data_dev != data_dev) { + *error = "data device already in use by a pool"; + return ERR_PTR(-EBUSY); + } __pool_inc(pool); } else { pool = __pool_table_lookup(pool_md); if (pool) { - if (pool->md_dev != metadata_dev) { + if (pool->md_dev != metadata_dev || pool->data_dev != data_dev) { *error = "different pool cannot replace a pool"; return ERR_PTR(-EINVAL); } __pool_inc(pool); } else { - pool = pool_create(pool_md, metadata_dev, block_size, read_only, error); + pool = pool_create(pool_md, metadata_dev, data_dev, block_size, read_only, error); *created = 1; } } @@ -3127,6 +3144,7 @@ static void pool_dtr(struct dm_target *ti) __pool_dec(pt->pool); dm_put_device(ti, pt->metadata_dev); dm_put_device(ti, pt->data_dev); + bio_uninit(&pt->flush_bio); kfree(pt); mutex_unlock(&dm_thin_pool_table.mutex); @@ -3192,6 +3210,29 @@ static void metadata_low_callback(void *context) dm_table_event(pool->ti->table); } +/* + * We need to flush the data device **before** committing the metadata. + * + * This ensures that the data blocks of any newly inserted mappings are + * properly written to non-volatile storage and won't be lost in case of a + * crash. + * + * Failure to do so can result in data corruption in the case of internal or + * external snapshots and in the case of newly provisioned blocks, when block + * zeroing is enabled. + */ +static int metadata_pre_commit_callback(void *context) +{ + struct pool_c *pt = context; + struct bio *flush_bio = &pt->flush_bio; + + bio_reset(flush_bio); + bio_set_dev(flush_bio, pt->data_dev->bdev); + flush_bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; + + return submit_bio_wait(flush_bio); +} + static sector_t get_dev_size(struct block_device *bdev) { return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; @@ -3335,7 +3376,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) goto out; } - pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, + pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, data_dev->bdev, block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created); if (IS_ERR(pool)) { r = PTR_ERR(pool); @@ -3360,6 +3401,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) pt->data_dev = data_dev; pt->low_water_blocks = low_water_blocks; pt->adjusted_pf = pt->requested_pf = pf; + bio_init(&pt->flush_bio, NULL, 0); ti->num_flush_bios = 1; /* @@ -3549,6 +3591,9 @@ static int pool_preresume(struct dm_target *ti) if (r) return r; + dm_pool_register_pre_commit_callback(pool->pmd, + metadata_pre_commit_callback, pt); + r = maybe_resize_data_dev(ti, &need_commit1); if (r) return r; @@ -4077,7 +4122,7 @@ static struct target_type pool_target = { .name = "thin-pool", .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | DM_TARGET_IMMUTABLE, - .version = {1, 21, 0}, + .version = {1, 22, 0}, .module = THIS_MODULE, .ctr = pool_ctr, .dtr = pool_dtr, @@ -4456,7 +4501,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type thin_target = { .name = "thin", - .version = {1, 21, 0}, + .version = {1, 22, 0}, .module = THIS_MODULE, .ctr = thin_ctr, .dtr = thin_dtr, diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c index 3ceeb6b404ed..fb41b4f23c48 100644 --- a/drivers/md/dm-verity-fec.c +++ b/drivers/md/dm-verity-fec.c @@ -435,7 +435,7 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, fio->level++; if (type == DM_VERITY_BLOCK_TYPE_METADATA) - block += v->data_blocks; + block = block - v->hash_start + v->data_blocks; /* * For RS(M, N), the continuous FEC data is divided into blocks of N @@ -551,6 +551,7 @@ void verity_fec_dtr(struct dm_verity *v) mempool_exit(&f->rs_pool); mempool_exit(&f->prealloc_pool); mempool_exit(&f->extra_pool); + mempool_exit(&f->output_pool); kmem_cache_destroy(f->cache); if (f->data_bufio) diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 1cb137f0ef9d..2727a525b5bb 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -280,6 +280,8 @@ static int persistent_memory_claim(struct dm_writecache *wc) while (daa-- && i < p) { pages[i++] = pfn_t_to_page(pfn); pfn.val++; + if (!(i & 15)) + cond_resched(); } } while (i < p); wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL); @@ -443,7 +445,13 @@ static void writecache_notify_io(unsigned long error, void *context) complete(&endio->c); } -static void ssd_commit_flushed(struct dm_writecache *wc) +static void writecache_wait_for_ios(struct dm_writecache *wc, int direction) +{ + wait_event(wc->bio_in_progress_wait[direction], + !atomic_read(&wc->bio_in_progress[direction])); +} + +static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios) { struct dm_io_region region; struct dm_io_request req; @@ -489,17 +497,20 @@ static void ssd_commit_flushed(struct dm_writecache *wc) writecache_notify_io(0, &endio); wait_for_completion_io(&endio.c); + if (wait_for_ios) + writecache_wait_for_ios(wc, WRITE); + writecache_disk_flush(wc, wc->ssd_dev); memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size); } -static void writecache_commit_flushed(struct dm_writecache *wc) +static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios) { if (WC_MODE_PMEM(wc)) wmb(); else - ssd_commit_flushed(wc); + ssd_commit_flushed(wc, wait_for_ios); } static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev) @@ -523,12 +534,6 @@ static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev) writecache_error(wc, r, "error flushing metadata: %d", r); } -static void writecache_wait_for_ios(struct dm_writecache *wc, int direction) -{ - wait_event(wc->bio_in_progress_wait[direction], - !atomic_read(&wc->bio_in_progress[direction])); -} - #define WFE_RETURN_FOLLOWING 1 #define WFE_LOWEST_SEQ 2 @@ -623,6 +628,12 @@ static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry wc->freelist_size++; } +static inline void writecache_verify_watermark(struct dm_writecache *wc) +{ + if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark)) + queue_work(wc->writeback_wq, &wc->writeback_work); +} + static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc) { struct wc_entry *e; @@ -644,8 +655,8 @@ static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc) list_del(&e->lru); } wc->freelist_size--; - if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark)) - queue_work(wc->writeback_wq, &wc->writeback_work); + + writecache_verify_watermark(wc); return e; } @@ -725,14 +736,12 @@ static void writecache_flush(struct dm_writecache *wc) e = e2; cond_resched(); } - writecache_commit_flushed(wc); - - writecache_wait_for_ios(wc, WRITE); + writecache_commit_flushed(wc, true); wc->seq_count++; pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count)); writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count); - writecache_commit_flushed(wc); + writecache_commit_flushed(wc, false); wc->overwrote_committed = false; @@ -756,7 +765,7 @@ static void writecache_flush(struct dm_writecache *wc) } if (need_flush_after_free) - writecache_commit_flushed(wc); + writecache_commit_flushed(wc, false); } static void writecache_flush_work(struct work_struct *work) @@ -799,6 +808,8 @@ static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_ writecache_wait_for_ios(wc, WRITE); discarded_something = true; } + if (!writecache_entry_is_committed(wc, e)) + wc->uncommitted_blocks--; writecache_free_entry(wc, e); } @@ -809,7 +820,7 @@ static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_ } if (discarded_something) - writecache_commit_flushed(wc); + writecache_commit_flushed(wc, false); } static bool writecache_wait_for_writeback(struct dm_writecache *wc) @@ -838,7 +849,7 @@ static void writecache_suspend(struct dm_target *ti) } wc_unlock(wc); - flush_workqueue(wc->writeback_wq); + drain_workqueue(wc->writeback_wq); wc_lock(wc); if (flush_on_suspend) @@ -866,11 +877,30 @@ static int writecache_alloc_entries(struct dm_writecache *wc) struct wc_entry *e = &wc->entries[b]; e->index = b; e->write_in_progress = false; + cond_resched(); } return 0; } +static int writecache_read_metadata(struct dm_writecache *wc, sector_t n_sectors) +{ + struct dm_io_region region; + struct dm_io_request req; + + region.bdev = wc->ssd_dev->bdev; + region.sector = wc->start_sector; + region.count = n_sectors; + req.bi_op = REQ_OP_READ; + req.bi_op_flags = REQ_SYNC; + req.mem.type = DM_IO_VMA; + req.mem.ptr.vma = (char *)wc->memory_map; + req.client = wc->dm_io; + req.notify.fn = NULL; + + return dm_io(&req, 1, ®ion, NULL); +} + static void writecache_resume(struct dm_target *ti) { struct dm_writecache *wc = ti->private; @@ -881,8 +911,18 @@ static void writecache_resume(struct dm_target *ti) wc_lock(wc); - if (WC_MODE_PMEM(wc)) + if (WC_MODE_PMEM(wc)) { persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size); + } else { + r = writecache_read_metadata(wc, wc->metadata_sectors); + if (r) { + size_t sb_entries_offset; + writecache_error(wc, r, "unable to read metadata: %d", r); + sb_entries_offset = offsetof(struct wc_memory_superblock, entries); + memset((char *)wc->memory_map + sb_entries_offset, -1, + (wc->metadata_sectors << SECTOR_SHIFT) - sb_entries_offset); + } + } wc->tree = RB_ROOT; INIT_LIST_HEAD(&wc->lru); @@ -920,6 +960,7 @@ static void writecache_resume(struct dm_target *ti) e->original_sector = le64_to_cpu(wme.original_sector); e->seq_count = le64_to_cpu(wme.seq_count); } + cond_resched(); } #endif for (b = 0; b < wc->n_blocks; b++) { @@ -958,9 +999,11 @@ erase_this: if (need_flush) { writecache_flush_all_metadata(wc); - writecache_commit_flushed(wc); + writecache_commit_flushed(wc, false); } + writecache_verify_watermark(wc); + wc_unlock(wc); } @@ -1218,7 +1261,8 @@ bio_copy: } } while (bio->bi_iter.bi_size); - if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) + if (unlikely(bio->bi_opf & REQ_FUA || + wc->uncommitted_blocks >= wc->autocommit_blocks)) writecache_flush(wc); else writecache_schedule_autocommit(wc); @@ -1341,7 +1385,7 @@ static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head * wc->writeback_size--; n_walked++; if (unlikely(n_walked >= ENDIO_LATENCY)) { - writecache_commit_flushed(wc); + writecache_commit_flushed(wc, false); wc_unlock(wc); wc_lock(wc); n_walked = 0; @@ -1422,7 +1466,7 @@ pop_from_list: writecache_wait_for_ios(wc, READ); } - writecache_commit_flushed(wc); + writecache_commit_flushed(wc, false); wc_unlock(wc); } @@ -1748,14 +1792,16 @@ static int init_memory(struct dm_writecache *wc) pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks)); pmem_assign(sb(wc)->seq_count, cpu_to_le64(0)); - for (b = 0; b < wc->n_blocks; b++) + for (b = 0; b < wc->n_blocks; b++) { write_original_sector_seq_count(wc, &wc->entries[b], -1, -1); + cond_resched(); + } writecache_flush_all_metadata(wc); - writecache_commit_flushed(wc); + writecache_commit_flushed(wc, false); pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC)); writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic); - writecache_commit_flushed(wc); + writecache_commit_flushed(wc, false); return 0; } @@ -1958,6 +2004,12 @@ static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv) ti->error = "Invalid block size"; goto bad; } + if (wc->block_size < bdev_logical_block_size(wc->dev->bdev) || + wc->block_size < bdev_logical_block_size(wc->ssd_dev->bdev)) { + r = -EINVAL; + ti->error = "Block size is smaller than device logical block size"; + goto bad; + } wc->block_size_bits = __ffs(wc->block_size); wc->max_writeback_jobs = MAX_WRITEBACK_JOBS; @@ -2046,8 +2098,6 @@ invalid_optional: goto bad; } } else { - struct dm_io_region region; - struct dm_io_request req; size_t n_blocks, n_metadata_blocks; uint64_t n_bitmap_bits; @@ -2104,19 +2154,9 @@ invalid_optional: goto bad; } - region.bdev = wc->ssd_dev->bdev; - region.sector = wc->start_sector; - region.count = wc->metadata_sectors; - req.bi_op = REQ_OP_READ; - req.bi_op_flags = REQ_SYNC; - req.mem.type = DM_IO_VMA; - req.mem.ptr.vma = (char *)wc->memory_map; - req.client = wc->dm_io; - req.notify.fn = NULL; - - r = dm_io(&req, 1, ®ion, NULL); + r = writecache_read_metadata(wc, wc->block_size >> SECTOR_SHIFT); if (r) { - ti->error = "Unable to read metadata"; + ti->error = "Unable to read first block of metadata"; goto bad; } } diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 7e8d7fc99410..5c2bbdf67f25 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -132,6 +132,7 @@ struct dmz_metadata { sector_t zone_bitmap_size; unsigned int zone_nr_bitmap_blocks; + unsigned int zone_bits_per_mblk; unsigned int nr_bitmap_blocks; unsigned int nr_map_blocks; @@ -552,6 +553,7 @@ static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd, TASK_UNINTERRUPTIBLE); if (test_bit(DMZ_META_ERROR, &mblk->state)) { dmz_release_mblock(zmd, mblk); + dmz_check_bdev(zmd->dev); return ERR_PTR(-EIO); } @@ -623,6 +625,8 @@ static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block, ret = submit_bio_wait(bio); bio_put(bio); + if (ret) + dmz_check_bdev(zmd->dev); return ret; } @@ -689,6 +693,7 @@ static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd, TASK_UNINTERRUPTIBLE); if (test_bit(DMZ_META_ERROR, &mblk->state)) { clear_bit(DMZ_META_ERROR, &mblk->state); + dmz_check_bdev(zmd->dev); ret = -EIO; } nr_mblks_submitted--; @@ -766,7 +771,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) /* If there are no dirty metadata blocks, just flush the device cache */ if (list_empty(&write_list)) { ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL); - goto out; + goto err; } /* @@ -776,7 +781,7 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) */ ret = dmz_log_dirty_mblocks(zmd, &write_list); if (ret) - goto out; + goto err; /* * The log is on disk. It is now safe to update in place @@ -784,11 +789,11 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) */ ret = dmz_write_dirty_mblocks(zmd, &write_list, zmd->mblk_primary); if (ret) - goto out; + goto err; ret = dmz_write_sb(zmd, zmd->mblk_primary); if (ret) - goto out; + goto err; while (!list_empty(&write_list)) { mblk = list_first_entry(&write_list, struct dmz_mblock, link); @@ -803,16 +808,20 @@ int dmz_flush_metadata(struct dmz_metadata *zmd) zmd->sb_gen++; out: - if (ret && !list_empty(&write_list)) { - spin_lock(&zmd->mblk_lock); - list_splice(&write_list, &zmd->mblk_dirty_list); - spin_unlock(&zmd->mblk_lock); - } - dmz_unlock_flush(zmd); up_write(&zmd->mblk_sem); return ret; + +err: + if (!list_empty(&write_list)) { + spin_lock(&zmd->mblk_lock); + list_splice(&write_list, &zmd->mblk_dirty_list); + spin_unlock(&zmd->mblk_lock); + } + if (!dmz_check_bdev(zmd->dev)) + ret = -EIO; + goto out; } /* @@ -1096,7 +1105,6 @@ static int dmz_init_zone(struct dmz_metadata *zmd, struct dm_zone *zone, if (blkz->type == BLK_ZONE_TYPE_CONVENTIONAL) { set_bit(DMZ_RND, &zone->flags); - zmd->nr_rnd_zones++; } else if (blkz->type == BLK_ZONE_TYPE_SEQWRITE_REQ || blkz->type == BLK_ZONE_TYPE_SEQWRITE_PREF) { set_bit(DMZ_SEQ, &zone->flags); @@ -1157,7 +1165,10 @@ static int dmz_init_zones(struct dmz_metadata *zmd) /* Init */ zmd->zone_bitmap_size = dev->zone_nr_blocks >> 3; - zmd->zone_nr_bitmap_blocks = zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT; + zmd->zone_nr_bitmap_blocks = + max_t(sector_t, 1, zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT); + zmd->zone_bits_per_mblk = min_t(sector_t, dev->zone_nr_blocks, + DMZ_BLOCK_SIZE_BITS); /* Allocate zone array */ zmd->zones = kcalloc(dev->nr_zones, sizeof(struct dm_zone), GFP_KERNEL); @@ -1235,6 +1246,7 @@ static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone) if (ret) { dmz_dev_err(zmd->dev, "Get zone %u report failed", dmz_id(zmd, zone)); + dmz_check_bdev(zmd->dev); return ret; } @@ -1568,7 +1580,7 @@ static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd) return dzone; } - return ERR_PTR(-EBUSY); + return NULL; } /* @@ -1588,7 +1600,7 @@ static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd) return zone; } - return ERR_PTR(-EBUSY); + return NULL; } /* @@ -1973,7 +1985,7 @@ int dmz_copy_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone, dmz_release_mblock(zmd, to_mblk); dmz_release_mblock(zmd, from_mblk); - chunk_block += DMZ_BLOCK_SIZE_BITS; + chunk_block += zmd->zone_bits_per_mblk; } to_zone->weight = from_zone->weight; @@ -2034,7 +2046,7 @@ int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, /* Set bits */ bit = chunk_block & DMZ_BLOCK_MASK_BITS; - nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit); + nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit); count = dmz_set_bits((unsigned long *)mblk->data, bit, nr_bits); if (count) { @@ -2113,7 +2125,7 @@ int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone, /* Clear bits */ bit = chunk_block & DMZ_BLOCK_MASK_BITS; - nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit); + nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit); count = dmz_clear_bits((unsigned long *)mblk->data, bit, nr_bits); @@ -2173,6 +2185,7 @@ static int dmz_to_next_set_block(struct dmz_metadata *zmd, struct dm_zone *zone, { struct dmz_mblock *mblk; unsigned int bit, set_bit, nr_bits; + unsigned int zone_bits = zmd->zone_bits_per_mblk; unsigned long *bitmap; int n = 0; @@ -2187,15 +2200,15 @@ static int dmz_to_next_set_block(struct dmz_metadata *zmd, struct dm_zone *zone, /* Get offset */ bitmap = (unsigned long *) mblk->data; bit = chunk_block & DMZ_BLOCK_MASK_BITS; - nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit); + nr_bits = min(nr_blocks, zone_bits - bit); if (set) - set_bit = find_next_bit(bitmap, DMZ_BLOCK_SIZE_BITS, bit); + set_bit = find_next_bit(bitmap, zone_bits, bit); else - set_bit = find_next_zero_bit(bitmap, DMZ_BLOCK_SIZE_BITS, bit); + set_bit = find_next_zero_bit(bitmap, zone_bits, bit); dmz_release_mblock(zmd, mblk); n += set_bit - bit; - if (set_bit < DMZ_BLOCK_SIZE_BITS) + if (set_bit < zone_bits) break; nr_blocks -= nr_bits; @@ -2298,7 +2311,7 @@ static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone) /* Count bits in this block */ bitmap = mblk->data; bit = chunk_block & DMZ_BLOCK_MASK_BITS; - nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit); + nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit); n += dmz_count_bits(bitmap, bit, nr_bits); dmz_release_mblock(zmd, mblk); diff --git a/drivers/md/dm-zoned-reclaim.c b/drivers/md/dm-zoned-reclaim.c index 9470b8f77a33..879848aad97a 100644 --- a/drivers/md/dm-zoned-reclaim.c +++ b/drivers/md/dm-zoned-reclaim.c @@ -81,6 +81,7 @@ static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone, "Align zone %u wp %llu to %llu (wp+%u) blocks failed %d", dmz_id(zmd, zone), (unsigned long long)wp_block, (unsigned long long)block, nr_blocks, ret); + dmz_check_bdev(zrc->dev); return ret; } @@ -347,8 +348,8 @@ static int dmz_do_reclaim(struct dmz_reclaim *zrc) /* Get a data zone */ dzone = dmz_get_zone_for_reclaim(zmd); - if (IS_ERR(dzone)) - return PTR_ERR(dzone); + if (!dzone) + return -EBUSY; start = jiffies; @@ -488,12 +489,7 @@ static void dmz_reclaim_work(struct work_struct *work) ret = dmz_do_reclaim(zrc); if (ret) { dmz_dev_debug(zrc->dev, "Reclaim error %d\n", ret); - if (ret == -EIO) - /* - * LLD might be performing some error handling sequence - * at the underlying device. To not interfere, do not - * attempt to schedule the next reclaim run immediately. - */ + if (!dmz_check_bdev(zrc->dev)) return; } diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 3334f5865de7..159d27aa712b 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -79,6 +79,8 @@ static inline void dmz_bio_endio(struct bio *bio, blk_status_t status) if (status != BLK_STS_OK && bio->bi_status == BLK_STS_OK) bio->bi_status = status; + if (bio->bi_status != BLK_STS_OK) + bioctx->target->dev->flags |= DMZ_CHECK_BDEV; if (refcount_dec_and_test(&bioctx->ref)) { struct dm_zone *zone = bioctx->zone; @@ -564,32 +566,52 @@ out: } /* - * Check the backing device availability. If it's on the way out, + * Check if the backing device is being removed. If it's on the way out, * start failing I/O. Reclaim and metadata components also call this * function to cleanly abort operation in the event of such failure. */ bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev) { - struct gendisk *disk; + if (dmz_dev->flags & DMZ_BDEV_DYING) + return true; - if (!(dmz_dev->flags & DMZ_BDEV_DYING)) { - disk = dmz_dev->bdev->bd_disk; - if (blk_queue_dying(bdev_get_queue(dmz_dev->bdev))) { - dmz_dev_warn(dmz_dev, "Backing device queue dying"); - dmz_dev->flags |= DMZ_BDEV_DYING; - } else if (disk->fops->check_events) { - if (disk->fops->check_events(disk, 0) & - DISK_EVENT_MEDIA_CHANGE) { - dmz_dev_warn(dmz_dev, "Backing device offline"); - dmz_dev->flags |= DMZ_BDEV_DYING; - } - } + if (dmz_dev->flags & DMZ_CHECK_BDEV) + return !dmz_check_bdev(dmz_dev); + + if (blk_queue_dying(bdev_get_queue(dmz_dev->bdev))) { + dmz_dev_warn(dmz_dev, "Backing device queue dying"); + dmz_dev->flags |= DMZ_BDEV_DYING; } return dmz_dev->flags & DMZ_BDEV_DYING; } /* + * Check the backing device availability. This detects such events as + * backing device going offline due to errors, media removals, etc. + * This check is less efficient than dmz_bdev_is_dying() and should + * only be performed as a part of error handling. + */ +bool dmz_check_bdev(struct dmz_dev *dmz_dev) +{ + struct gendisk *disk; + + dmz_dev->flags &= ~DMZ_CHECK_BDEV; + + if (dmz_bdev_is_dying(dmz_dev)) + return false; + + disk = dmz_dev->bdev->bd_disk; + if (disk->fops->check_events && + disk->fops->check_events(disk, 0) & DISK_EVENT_MEDIA_CHANGE) { + dmz_dev_warn(dmz_dev, "Backing device offline"); + dmz_dev->flags |= DMZ_BDEV_DYING; + } + + return !(dmz_dev->flags & DMZ_BDEV_DYING); +} + +/* * Process a new BIO. */ static int dmz_map(struct dm_target *ti, struct bio *bio) @@ -767,7 +789,7 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) } /* Set target (no write same support) */ - ti->max_io_len = dev->zone_nr_sectors << 9; + ti->max_io_len = dev->zone_nr_sectors; ti->num_flush_bios = 1; ti->num_discard_bios = 1; ti->num_write_zeroes_bios = 1; @@ -901,8 +923,8 @@ static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev) { struct dmz_target *dmz = ti->private; - if (dmz_bdev_is_dying(dmz->dev)) - return -ENODEV; + if (!dmz_check_bdev(dmz->dev)) + return -EIO; *bdev = dmz->dev->bdev; diff --git a/drivers/md/dm-zoned.h b/drivers/md/dm-zoned.h index 93a64529f219..2662746ba8b9 100644 --- a/drivers/md/dm-zoned.h +++ b/drivers/md/dm-zoned.h @@ -71,6 +71,7 @@ struct dmz_dev { /* Device flags. */ #define DMZ_BDEV_DYING (1 << 0) +#define DMZ_CHECK_BDEV (2 << 0) /* * Zone descriptor. @@ -254,5 +255,6 @@ void dmz_schedule_reclaim(struct dmz_reclaim *zrc); * Functions defined in dm-zoned-target.c */ bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev); +bool dmz_check_bdev(struct dmz_dev *dmz_dev); #endif /* DM_ZONED_H */ diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 5475081dcbd6..9e7765ea73a9 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -12,6 +12,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/mutex.h> +#include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/blkpg.h> #include <linux/bio.h> @@ -140,6 +141,7 @@ EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr); #define DMF_NOFLUSH_SUSPENDING 5 #define DMF_DEFERRED_REMOVE 6 #define DMF_SUSPENDED_INTERNALLY 7 +#define DMF_POST_SUSPENDING 8 #define DM_NUMA_NODE NUMA_NO_NODE static int dm_numa_node = DM_NUMA_NODE; @@ -1438,9 +1440,6 @@ static int __send_empty_flush(struct clone_info *ci) BUG_ON(bio_has_data(ci->bio)); while ((ti = dm_table_get_target(ci->map, target_nr++))) __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); - - bio_disassociate_blkg(ci->bio); - return 0; } @@ -1628,6 +1627,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, ci.bio = &flush_bio; ci.sector_count = 0; error = __send_empty_flush(&ci); + bio_uninit(ci.bio); /* dec_pending submits any data associated with flush */ } else if (bio_op(bio) == REQ_OP_ZONE_RESET) { ci.bio = bio; @@ -1702,6 +1702,7 @@ static blk_qc_t __process_bio(struct mapped_device *md, struct dm_table *map, ci.bio = &flush_bio; ci.sector_count = 0; error = __send_empty_flush(&ci); + bio_uninit(ci.bio); /* dec_pending submits any data associated with flush */ } else { struct dm_target_io *tio; @@ -1762,8 +1763,9 @@ static blk_qc_t dm_process_bio(struct mapped_device *md, * won't be imposed. */ if (current->bio_list) { - blk_queue_split(md->queue, &bio); - if (!is_abnormal_io(bio)) + if (is_abnormal_io(bio)) + blk_queue_split(md->queue, &bio); + else dm_queue_split(md, ti, &bio); } @@ -1811,7 +1813,8 @@ static int dm_any_congested(void *congested_data, int bdi_bits) * With request-based DM we only need to check the * top-level queue for congestion. */ - r = md->queue->backing_dev_info->wb.state & bdi_bits; + struct backing_dev_info *bdi = md->queue->backing_dev_info; + r = bdi->wb.congested->state & bdi_bits; } else { map = dm_get_live_table_fast(md); if (map) @@ -1877,14 +1880,6 @@ static const struct dax_operations dm_dax_ops; static void dm_wq_work(struct work_struct *work); -static void dm_init_normal_md_queue(struct mapped_device *md) -{ - /* - * Initialize aspects of queue that aren't relevant for blk-mq - */ - md->queue->backing_dev_info->congested_fn = dm_any_congested; -} - static void cleanup_mapped_device(struct mapped_device *md) { if (md->wq) @@ -1972,7 +1967,12 @@ static struct mapped_device *alloc_dev(int minor) if (!md->queue) goto bad; md->queue->queuedata = md; - md->queue->backing_dev_info->congested_data = md; + /* + * default to bio-based required ->make_request_fn until DM + * table is loaded and md->type established. If request-based + * table is loaded: blk-mq will override accordingly. + */ + blk_queue_make_request(md->queue, dm_make_request); md->disk = alloc_disk_node(1, md->numa_node_id); if (!md->disk) @@ -2265,6 +2265,12 @@ struct queue_limits *dm_get_queue_limits(struct mapped_device *md) } EXPORT_SYMBOL_GPL(dm_get_queue_limits); +static void dm_init_congested_fn(struct mapped_device *md) +{ + md->queue->backing_dev_info->congested_data = md; + md->queue->backing_dev_info->congested_fn = dm_any_congested; +} + /* * Setup the DM device's queue based on md's type */ @@ -2281,12 +2287,12 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) DMERR("Cannot initialize queue for request-based dm-mq mapped device"); return r; } + dm_init_congested_fn(md); break; case DM_TYPE_BIO_BASED: case DM_TYPE_DAX_BIO_BASED: case DM_TYPE_NVME_BIO_BASED: - dm_init_normal_md_queue(md); - blk_queue_make_request(md->queue, dm_make_request); + dm_init_congested_fn(md); break; case DM_TYPE_NONE: WARN_ON_ONCE(true); @@ -2385,6 +2391,8 @@ static void __dm_destroy(struct mapped_device *md, bool wait) map = dm_get_live_table(md, &srcu_idx); if (!dm_suspended_md(md)) { dm_table_presuspend_targets(map); + set_bit(DMF_SUSPENDED, &md->flags); + set_bit(DMF_POST_SUSPENDING, &md->flags); dm_table_postsuspend_targets(map); } /* dm_put_live_table must be before msleep, otherwise deadlock is possible */ @@ -2707,7 +2715,9 @@ retry: if (r) goto out_unlock; + set_bit(DMF_POST_SUSPENDING, &md->flags); dm_table_postsuspend_targets(map); + clear_bit(DMF_POST_SUSPENDING, &md->flags); out_unlock: mutex_unlock(&md->suspend_lock); @@ -2804,7 +2814,9 @@ static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_fla (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE, DMF_SUSPENDED_INTERNALLY); + set_bit(DMF_POST_SUSPENDING, &md->flags); dm_table_postsuspend_targets(map); + clear_bit(DMF_POST_SUSPENDING, &md->flags); } static void __dm_internal_resume(struct mapped_device *md) @@ -2881,17 +2893,25 @@ EXPORT_SYMBOL_GPL(dm_internal_resume_fast); int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, unsigned cookie) { + int r; + unsigned noio_flag; char udev_cookie[DM_COOKIE_LENGTH]; char *envp[] = { udev_cookie, NULL }; + noio_flag = memalloc_noio_save(); + if (!cookie) - return kobject_uevent(&disk_to_dev(md->disk)->kobj, action); + r = kobject_uevent(&disk_to_dev(md->disk)->kobj, action); else { snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", DM_COOKIE_ENV_VAR_NAME, cookie); - return kobject_uevent_env(&disk_to_dev(md->disk)->kobj, - action, envp); + r = kobject_uevent_env(&disk_to_dev(md->disk)->kobj, + action, envp); } + + memalloc_noio_restore(noio_flag); + + return r; } uint32_t dm_next_uevent_seq(struct mapped_device *md) @@ -2957,6 +2977,11 @@ int dm_suspended_md(struct mapped_device *md) return test_bit(DMF_SUSPENDED, &md->flags); } +static int dm_post_suspending_md(struct mapped_device *md) +{ + return test_bit(DMF_POST_SUSPENDING, &md->flags); +} + int dm_suspended_internally_md(struct mapped_device *md) { return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); @@ -2973,6 +2998,12 @@ int dm_suspended(struct dm_target *ti) } EXPORT_SYMBOL_GPL(dm_suspended); +int dm_post_suspending(struct dm_target *ti) +{ + return dm_post_suspending_md(dm_table_get_md(ti->table)); +} +EXPORT_SYMBOL_GPL(dm_post_suspending); + int dm_noflush_suspending(struct dm_target *ti) { return __noflush_suspending(dm_table_get_md(ti->table)); diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index c01d41198f5e..ddaccaa15329 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2133,6 +2133,7 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, memcpy(page_address(store.sb_page), page_address(bitmap->storage.sb_page), sizeof(bitmap_super_t)); + spin_lock_irq(&bitmap->counts.lock); md_bitmap_file_unmap(&bitmap->storage); bitmap->storage = store; @@ -2148,7 +2149,6 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, blocks = min(old_counts.chunks << old_counts.chunkshift, chunks << chunkshift); - spin_lock_irq(&bitmap->counts.lock); /* For cluster raid, need to pre-allocate bitmap */ if (mddev_is_clustered(bitmap->mddev)) { unsigned long page; diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 813a99ffa86f..d50737ec4039 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -1139,6 +1139,7 @@ static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsiz bitmap = get_bitmap_from_slot(mddev, i); if (IS_ERR(bitmap)) { pr_err("can't get bitmap from slot %d\n", i); + bitmap = NULL; goto out; } counts = &bitmap->counts; @@ -1518,6 +1519,7 @@ static void unlock_all_bitmaps(struct mddev *mddev) } } kfree(cinfo->other_bitmap_lockres); + cinfo->other_bitmap_lockres = NULL; } } diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 7354466ddc90..afcf1d388300 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -244,10 +244,9 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio) sector_t start_sector, end_sector, data_offset; sector_t bio_sector = bio->bi_iter.bi_sector; - if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { - md_flush_request(mddev, bio); + if (unlikely(bio->bi_opf & REQ_PREFLUSH) + && md_flush_request(mddev, bio)) return true; - } tmp_dev = which_dev(mddev, bio_sector); start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c index 6780938d2991..152f9e65a226 100644 --- a/drivers/md/md-multipath.c +++ b/drivers/md/md-multipath.c @@ -104,10 +104,9 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio) struct multipath_bh * mp_bh; struct multipath_info *multipath; - if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { - md_flush_request(mddev, bio); + if (unlikely(bio->bi_opf & REQ_PREFLUSH) + && md_flush_request(mddev, bio)) return true; - } mp_bh = mempool_alloc(&conf->pool, GFP_NOIO); diff --git a/drivers/md/md.c b/drivers/md/md.c index c29002d8f337..1dbfe8213575 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -473,7 +473,13 @@ static void md_submit_flush_data(struct work_struct *ws) } } -void md_flush_request(struct mddev *mddev, struct bio *bio) +/* + * Manages consolidation of flushes and submitting any flushes needed for + * a bio with REQ_PREFLUSH. Returns true if the bio is finished or is + * being finished in another context. Returns false if the flushing is + * complete but still needs the I/O portion of the bio to be processed. + */ +bool md_flush_request(struct mddev *mddev, struct bio *bio) { ktime_t start = ktime_get_boottime(); spin_lock_irq(&mddev->lock); @@ -498,9 +504,10 @@ void md_flush_request(struct mddev *mddev, struct bio *bio) bio_endio(bio); else { bio->bi_opf &= ~REQ_PREFLUSH; - mddev->pers->make_request(mddev, bio); + return false; } } + return true; } EXPORT_SYMBOL(md_flush_request); @@ -1021,6 +1028,7 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; mdp_super_t *sb; int ret; + bool spare_disk = true; /* * Calculate the position of the superblock (512byte sectors), @@ -1071,8 +1079,19 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor else rdev->desc_nr = sb->this_disk.number; + /* not spare disk, or LEVEL_MULTIPATH */ + if (sb->level == LEVEL_MULTIPATH || + (rdev->desc_nr >= 0 && + rdev->desc_nr < MD_SB_DISKS && + sb->disks[rdev->desc_nr].state & + ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))) + spare_disk = false; + if (!refdev) { - ret = 1; + if (!spare_disk) + ret = 1; + else + ret = 0; } else { __u64 ev1, ev2; mdp_super_t *refsb = page_address(refdev->sb_page); @@ -1088,7 +1107,8 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor } ev1 = md_event(sb); ev2 = md_event(refsb); - if (ev1 > ev2) + + if (!spare_disk && ev1 > ev2) ret = 1; else ret = 0; @@ -1160,6 +1180,8 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) mddev->new_layout = mddev->layout; mddev->new_chunk_sectors = mddev->chunk_sectors; } + if (mddev->level == 0) + mddev->layout = -1; if (sb->state & (1<<MD_SB_CLEAN)) mddev->recovery_cp = MaxSector; @@ -1446,6 +1468,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ sector_t sectors; char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; int bmask; + bool spare_disk = true; /* * Calculate the position of the superblock in 512byte sectors. @@ -1575,8 +1598,23 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset; } + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) && + sb->level != 0) + return -EINVAL; + + /* not spare disk, or LEVEL_MULTIPATH */ + if (sb->level == cpu_to_le32(LEVEL_MULTIPATH) || + (rdev->desc_nr >= 0 && + rdev->desc_nr < le32_to_cpu(sb->max_dev) && + (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX || + le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))) + spare_disk = false; + if (!refdev) { - ret = 1; + if (!spare_disk) + ret = 1; + else + ret = 0; } else { __u64 ev1, ev2; struct mdp_superblock_1 *refsb = page_address(refdev->sb_page); @@ -1593,7 +1631,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_ ev1 = le64_to_cpu(sb->events); ev2 = le64_to_cpu(refsb->events); - if (ev1 > ev2) + if (!spare_disk && ev1 > ev2) ret = 1; else ret = 0; @@ -1685,6 +1723,10 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) mddev->new_chunk_sectors = mddev->chunk_sectors; } + if (mddev->level == 0 && + !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT)) + mddev->layout = -1; + if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL) set_bit(MD_HAS_JOURNAL, &mddev->flags); @@ -3504,7 +3546,7 @@ abort_free: * Check a full RAID array for plausibility */ -static void analyze_sbs(struct mddev *mddev) +static int analyze_sbs(struct mddev *mddev) { int i; struct md_rdev *rdev, *freshest, *tmp; @@ -3525,6 +3567,12 @@ static void analyze_sbs(struct mddev *mddev) md_kick_rdev_from_array(rdev); } + /* Cannot find a valid fresh disk */ + if (!freshest) { + pr_warn("md: cannot find a valid disk\n"); + return -EINVAL; + } + super_types[mddev->major_version]. validate_super(mddev, freshest); @@ -3559,6 +3607,8 @@ static void analyze_sbs(struct mddev *mddev) clear_bit(In_sync, &rdev->flags); } } + + return 0; } /* Read a fixed-point number. @@ -5442,7 +5492,9 @@ int md_run(struct mddev *mddev) if (!mddev->raid_disks) { if (!mddev->persistent) return -EINVAL; - analyze_sbs(mddev); + err = analyze_sbs(mddev); + if (err) + return -EINVAL; } if (mddev->level != LEVEL_NONE) @@ -5849,7 +5901,7 @@ EXPORT_SYMBOL_GPL(md_stop_writes); static void mddev_detach(struct mddev *mddev) { md_bitmap_wait_behind_writes(mddev); - if (mddev->pers && mddev->pers->quiesce) { + if (mddev->pers && mddev->pers->quiesce && !mddev->suspended) { mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); } @@ -6759,6 +6811,9 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->external = 0; mddev->layout = info->layout; + if (mddev->level == 0) + /* Cannot trust RAID0 layout info here */ + mddev->layout = -1; mddev->chunk_sectors = info->chunk_size >> 9; if (mddev->persistent) { @@ -7413,7 +7468,8 @@ static int md_open(struct block_device *bdev, fmode_t mode) */ mddev_put(mddev); /* Wait until bdev->bd_disk is definitely gone */ - flush_workqueue(md_misc_wq); + if (work_pending(&mddev->del_work)) + flush_workqueue(md_misc_wq); /* Then retry the open from the top */ return -ERESTARTSYS; } diff --git a/drivers/md/md.h b/drivers/md/md.h index 6ad1f93a8c17..48498d210143 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -525,7 +525,7 @@ struct md_personality int level; struct list_head list; struct module *owner; - bool (*make_request)(struct mddev *mddev, struct bio *bio); + bool __must_check (*make_request)(struct mddev *mddev, struct bio *bio); /* * start up works that do NOT require md_thread. tasks that * requires md_thread should go into start() @@ -678,7 +678,7 @@ extern void md_error(struct mddev *mddev, struct md_rdev *rdev); extern void md_finish_reshape(struct mddev *mddev); extern int mddev_congested(struct mddev *mddev, int bits); -extern void md_flush_request(struct mddev *mddev, struct bio *bio); +extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio); extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, sector_t sector, int size, struct page *page); extern int md_super_wait(struct mddev *mddev); diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c index 21ea537bd55e..eff04fa23dfa 100644 --- a/drivers/md/persistent-data/dm-btree-remove.c +++ b/drivers/md/persistent-data/dm-btree-remove.c @@ -203,7 +203,13 @@ static void __rebalance2(struct dm_btree_info *info, struct btree_node *parent, struct btree_node *right = r->n; uint32_t nr_left = le32_to_cpu(left->header.nr_entries); uint32_t nr_right = le32_to_cpu(right->header.nr_entries); - unsigned threshold = 2 * merge_threshold(left) + 1; + /* + * Ensure the number of entries in each child will be greater + * than or equal to (max_entries / 3 + 1), so no matter which + * child is used for removal, the number will still be not + * less than (max_entries / 3). + */ + unsigned int threshold = 2 * (merge_threshold(left) + 1); if (nr_left + nr_right < threshold) { /* diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c index b8a62188f6be..f4fffe5e4f20 100644 --- a/drivers/md/persistent-data/dm-space-map-common.c +++ b/drivers/md/persistent-data/dm-space-map-common.c @@ -384,6 +384,33 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, return -ENOSPC; } +int sm_ll_find_common_free_block(struct ll_disk *old_ll, struct ll_disk *new_ll, + dm_block_t begin, dm_block_t end, dm_block_t *b) +{ + int r; + uint32_t count; + + do { + r = sm_ll_find_free_block(new_ll, begin, new_ll->nr_blocks, b); + if (r) + break; + + /* double check this block wasn't used in the old transaction */ + if (*b >= old_ll->nr_blocks) + count = 0; + else { + r = sm_ll_lookup(old_ll, *b, &count); + if (r) + break; + + if (count) + begin = *b + 1; + } + } while (count); + + return r; +} + static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b, int (*mutator)(void *context, uint32_t old, uint32_t *new), void *context, enum allocation_event *ev) diff --git a/drivers/md/persistent-data/dm-space-map-common.h b/drivers/md/persistent-data/dm-space-map-common.h index b3078d5eda0c..8de63ce39bdd 100644 --- a/drivers/md/persistent-data/dm-space-map-common.h +++ b/drivers/md/persistent-data/dm-space-map-common.h @@ -109,6 +109,8 @@ int sm_ll_lookup_bitmap(struct ll_disk *ll, dm_block_t b, uint32_t *result); int sm_ll_lookup(struct ll_disk *ll, dm_block_t b, uint32_t *result); int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin, dm_block_t end, dm_block_t *result); +int sm_ll_find_common_free_block(struct ll_disk *old_ll, struct ll_disk *new_ll, + dm_block_t begin, dm_block_t end, dm_block_t *result); int sm_ll_insert(struct ll_disk *ll, dm_block_t b, uint32_t ref_count, enum allocation_event *ev); int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev); int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev); diff --git a/drivers/md/persistent-data/dm-space-map-disk.c b/drivers/md/persistent-data/dm-space-map-disk.c index 32adf6b4a9c7..bf4c5e2ccb6f 100644 --- a/drivers/md/persistent-data/dm-space-map-disk.c +++ b/drivers/md/persistent-data/dm-space-map-disk.c @@ -167,8 +167,10 @@ static int sm_disk_new_block(struct dm_space_map *sm, dm_block_t *b) enum allocation_event ev; struct sm_disk *smd = container_of(sm, struct sm_disk, sm); - /* FIXME: we should loop round a couple of times */ - r = sm_ll_find_free_block(&smd->old_ll, smd->begin, smd->old_ll.nr_blocks, b); + /* + * Any block we allocate has to be free in both the old and current ll. + */ + r = sm_ll_find_common_free_block(&smd->old_ll, &smd->ll, smd->begin, smd->ll.nr_blocks, b); if (r) return r; diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index 25328582cc48..9e3c64ec2026 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -448,7 +448,10 @@ static int sm_metadata_new_block_(struct dm_space_map *sm, dm_block_t *b) enum allocation_event ev; struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); - r = sm_ll_find_free_block(&smm->old_ll, smm->begin, smm->old_ll.nr_blocks, b); + /* + * Any block we allocate has to be free in both the old and current ll. + */ + r = sm_ll_find_common_free_block(&smm->old_ll, &smm->ll, smm->begin, smm->ll.nr_blocks, b); if (r) return r; diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index c3445d2cedb9..02dc6b9c16f9 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -87,7 +87,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) char b[BDEVNAME_SIZE]; char b2[BDEVNAME_SIZE]; struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL); - unsigned short blksize = 512; + unsigned blksize = 512; *private_conf = ERR_PTR(-ENOMEM); if (!conf) @@ -145,6 +145,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) if (conf->nr_strip_zones == 1) { conf->layout = RAID0_ORIG_LAYOUT; + } else if (mddev->layout == RAID0_ORIG_LAYOUT || + mddev->layout == RAID0_ALT_MULTIZONE_LAYOUT) { + conf->layout = mddev->layout; } else if (default_layout == RAID0_ORIG_LAYOUT || default_layout == RAID0_ALT_MULTIZONE_LAYOUT) { conf->layout = default_layout; @@ -572,10 +575,9 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) unsigned chunk_sects; unsigned sectors; - if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { - md_flush_request(mddev, bio); + if (unlikely(bio->bi_opf & REQ_PREFLUSH) + && md_flush_request(mddev, bio)) return true; - } if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) { raid0_handle_discard(mddev, bio); @@ -612,7 +614,7 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) tmp_dev = map_sector(mddev, zone, sector, §or); break; default: - WARN("md/raid0:%s: Invalid layout\n", mdname(mddev)); + WARN(1, "md/raid0:%s: Invalid layout\n", mdname(mddev)); bio_io_error(bio); return true; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index f393f0dc042f..7ad019202caf 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1529,10 +1529,9 @@ static bool raid1_make_request(struct mddev *mddev, struct bio *bio) { sector_t sectors; - if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { - md_flush_request(mddev, bio); + if (unlikely(bio->bi_opf & REQ_PREFLUSH) + && md_flush_request(mddev, bio)) return true; - } /* * There is a limit to the maximum size, but @@ -2748,7 +2747,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, write_targets++; } } - if (bio->bi_end_io) { + if (rdev && bio->bi_end_io) { atomic_inc(&rdev->nr_pending); bio->bi_iter.bi_sector = sector_nr + rdev->data_offset; bio_set_dev(bio, rdev->bdev); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 57e2a4e8659a..664930238aaf 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1532,10 +1532,9 @@ static bool raid10_make_request(struct mddev *mddev, struct bio *bio) int chunk_sects = chunk_mask + 1; int sectors = bio_sectors(bio); - if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { - md_flush_request(mddev, bio); + if (unlikely(bio->bi_opf & REQ_PREFLUSH) + && md_flush_request(mddev, bio)) return true; - } if (!md_write_start(mddev, bio)) return false; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 90380064afc7..d0af52da65af 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2228,14 +2228,19 @@ static int grow_stripes(struct r5conf *conf, int num) * of the P and Q blocks. */ static int scribble_alloc(struct raid5_percpu *percpu, - int num, int cnt, gfp_t flags) + int num, int cnt) { size_t obj_size = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2); void *scribble; - scribble = kvmalloc_array(cnt, obj_size, flags); + /* + * If here is in raid array suspend context, it is in memalloc noio + * context as well, there is no potential recursive memory reclaim + * I/Os with the GFP_KERNEL flag. + */ + scribble = kvmalloc_array(cnt, obj_size, GFP_KERNEL); if (!scribble) return -ENOMEM; @@ -2267,8 +2272,7 @@ static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors) percpu = per_cpu_ptr(conf->percpu, cpu); err = scribble_alloc(percpu, new_disks, - new_sectors / STRIPE_SECTORS, - GFP_NOIO); + new_sectors / STRIPE_SECTORS); if (err) break; } @@ -3594,6 +3598,7 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, * is missing/faulty, then we need to read everything we can. */ if (sh->raid_conf->level != 6 && + sh->raid_conf->rmw_level != PARITY_DISABLE_RMW && sh->sector < sh->raid_conf->mddev->recovery_cp) /* reconstruct-write isn't being forced */ return 0; @@ -4830,7 +4835,7 @@ static void handle_stripe(struct stripe_head *sh) * or to load a block that is being partially written. */ if (s.to_read || s.non_overwrite - || (conf->level == 6 && s.to_write && s.failed) + || (s.to_write && s.failed) || (s.syncing && (s.uptodate + s.compute < disks)) || s.replacing || s.expanding) @@ -5588,8 +5593,8 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) if (ret == 0) return true; if (ret == -ENODEV) { - md_flush_request(mddev, bi); - return true; + if (md_flush_request(mddev, bi)) + return true; } /* ret == -EAGAIN, fallback */ /* @@ -5722,7 +5727,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) do_flush = false; } - if (!sh->batch_head) + if (!sh->batch_head || sh == sh->batch_head) set_bit(STRIPE_HANDLE, &sh->state); clear_bit(STRIPE_DELAYED, &sh->state); if ((!sh->batch_head || sh == sh->batch_head) && @@ -6761,8 +6766,7 @@ static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu conf->previous_raid_disks), max(conf->chunk_sectors, conf->prev_chunk_sectors) - / STRIPE_SECTORS, - GFP_KERNEL)) { + / STRIPE_SECTORS)) { free_scratch_buffer(conf, percpu); return -ENOMEM; } |