Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/bcache/alloc.c         |  35
-rw-r--r--  drivers/md/bcache/bcache.h        |   5
-rw-r--r--  drivers/md/bcache/btree.c         |  21
-rw-r--r--  drivers/md/bcache/super.c         |  10
-rw-r--r--  drivers/md/bcache/sysfs.c         |   2
-rw-r--r--  drivers/md/bcache/writeback.c     |  73
-rw-r--r--  drivers/md/dm-cache-metadata.c    |  54
-rw-r--r--  drivers/md/dm-cache-policy-smq.c  |  28
-rw-r--r--  drivers/md/dm-cache-target.c      |  15
-rw-r--r--  drivers/md/dm-clone-target.c      |   2
-rw-r--r--  drivers/md/dm-core.h              |   2
-rw-r--r--  drivers/md/dm-crypt.c             |  11
-rw-r--r--  drivers/md/dm-delay.c             |  17
-rw-r--r--  drivers/md/dm-flakey.c            |  34
-rw-r--r--  drivers/md/dm-integrity.c         |  34
-rw-r--r--  drivers/md/dm-io.c                |   2
-rw-r--r--  drivers/md/dm-ioctl.c             |  16
-rw-r--r--  drivers/md/dm-raid.c              |  19
-rw-r--r--  drivers/md/dm-snap-persistent.c   |   2
-rw-r--r--  drivers/md/dm-snap.c              |   4
-rw-r--r--  drivers/md/dm-stats.c             |   7
-rw-r--r--  drivers/md/dm-stats.h             |   2
-rw-r--r--  drivers/md/dm-table.c             |  11
-rw-r--r--  drivers/md/dm-thin-metadata.c     |  67
-rw-r--r--  drivers/md/dm-thin.c              |  26
-rw-r--r--  drivers/md/dm-verity-fec.c        |   3
-rw-r--r--  drivers/md/dm-verity-target.c     |  21
-rw-r--r--  drivers/md/dm-verity.h            |  10
-rw-r--r--  drivers/md/dm-writecache.c        |   2
-rw-r--r--  drivers/md/dm.c                   |  36
-rw-r--r--  drivers/md/md-bitmap.c            |  90
-rw-r--r--  drivers/md/md.c                   |  90
-rw-r--r--  drivers/md/raid0.c                |  66
-rw-r--r--  drivers/md/raid0.h                |   1
-rw-r--r--  drivers/md/raid1.c                |   4
-rw-r--r--  drivers/md/raid10.c               | 139
-rw-r--r--  drivers/md/raid5.c                |  28
37 files changed, 697 insertions(+), 292 deletions(-)
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index a1df0d95151c..5310e1f4a282 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -49,7 +49,7 @@
*
* bch_bucket_alloc() allocates a single bucket from a specific cache.
*
- * bch_bucket_alloc_set() allocates one or more buckets from different caches
+ * bch_bucket_alloc_set() allocates one bucket from one of the caches
* out of a cache set.
*
* free_some_buckets() drives all the processes described above. It's called
@@ -488,34 +488,29 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k)
}
int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
- struct bkey *k, int n, bool wait)
+ struct bkey *k, bool wait)
{
- int i;
+ struct cache *ca;
+ long b;
/* No allocation if CACHE_SET_IO_DISABLE bit is set */
if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags)))
return -1;
lockdep_assert_held(&c->bucket_lock);
- BUG_ON(!n || n > c->caches_loaded || n > MAX_CACHES_PER_SET);
bkey_init(k);
- /* sort by free space/prio of oldest data in caches */
-
- for (i = 0; i < n; i++) {
- struct cache *ca = c->cache_by_alloc[i];
- long b = bch_bucket_alloc(ca, reserve, wait);
+ ca = c->cache_by_alloc[0];
+ b = bch_bucket_alloc(ca, reserve, wait);
+ if (b == -1)
+ goto err;
- if (b == -1)
- goto err;
+ k->ptr[0] = MAKE_PTR(ca->buckets[b].gen,
+ bucket_to_sector(c, b),
+ ca->sb.nr_this_dev);
- k->ptr[i] = MAKE_PTR(ca->buckets[b].gen,
- bucket_to_sector(c, b),
- ca->sb.nr_this_dev);
-
- SET_KEY_PTRS(k, i + 1);
- }
+ SET_KEY_PTRS(k, 1);
return 0;
err:
@@ -525,12 +520,12 @@ err:
}
int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
- struct bkey *k, int n, bool wait)
+ struct bkey *k, bool wait)
{
int ret;
mutex_lock(&c->bucket_lock);
- ret = __bch_bucket_alloc_set(c, reserve, k, n, wait);
+ ret = __bch_bucket_alloc_set(c, reserve, k, wait);
mutex_unlock(&c->bucket_lock);
return ret;
}
@@ -638,7 +633,7 @@ bool bch_alloc_sectors(struct cache_set *c,
spin_unlock(&c->data_bucket_lock);
- if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, wait))
+ if (bch_bucket_alloc_set(c, watermark, &alloc.key, wait))
return false;
spin_lock(&c->data_bucket_lock);
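With the n parameter gone, bch_bucket_alloc_set() always fills exactly one pointer in the key. A minimal caller sketch under the new signature (hypothetical, modelled on the __uuid_write() hunk below; the error handling is illustrative only):

    BKEY_PADDED(key) alloc;

    /* request a single bucket; on success KEY_PTRS(&alloc.key) == 1 */
    if (bch_bucket_alloc_set(c, RESERVE_BTREE, &alloc.key, true))
        return -ENOSPC;	/* hypothetical caller policy */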
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 36de6f7ddf22..7bce58278845 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -265,6 +265,7 @@ struct bcache_device {
#define BCACHE_DEV_WB_RUNNING 3
#define BCACHE_DEV_RATE_DW_RUNNING 4
int nr_stripes;
+#define BCH_MIN_STRIPE_SZ ((4 << 20) >> SECTOR_SHIFT)
unsigned int stripe_size;
atomic_t *stripe_sectors_dirty;
unsigned long *full_dirty_stripes;
@@ -970,9 +971,9 @@ void bch_bucket_free(struct cache_set *c, struct bkey *k);
long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait);
int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
- struct bkey *k, int n, bool wait);
+ struct bkey *k, bool wait);
int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
- struct bkey *k, int n, bool wait);
+ struct bkey *k, bool wait);
bool bch_alloc_sectors(struct cache_set *c, struct bkey *k,
unsigned int sectors, unsigned int write_point,
unsigned int write_prio, bool wait);
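For reference, BCH_MIN_STRIPE_SZ above is 4 MiB expressed in 512-byte sectors: (4 << 20) >> SECTOR_SHIFT = 4194304 >> 9 = 8192 sectors.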
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 5a33910aea78..5db893d6a824 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1020,6 +1020,9 @@ err:
*
* The btree node will have either a read or a write lock held, depending on
* level and op->lock.
+ *
+ * Note: Only an error code or a btree pointer will be returned; it is
+ * unnecessary for callers to check for a NULL pointer.
*/
struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
struct bkey *k, int level, bool write,
@@ -1132,16 +1135,22 @@ retry:
mutex_unlock(&b->c->bucket_lock);
}
+/*
+ * Only an error code or a btree pointer will be returned; it is unnecessary
+ * for callers to check for a NULL pointer.
+ */
struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
int level, bool wait,
struct btree *parent)
{
BKEY_PADDED(key) k;
- struct btree *b = ERR_PTR(-EAGAIN);
+ struct btree *b;
mutex_lock(&c->bucket_lock);
retry:
- if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait))
+ /* return ERR_PTR(-EAGAIN) when it fails */
+ b = ERR_PTR(-EAGAIN);
+ if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, wait))
goto err;
bkey_put(c, &k.key);
@@ -1186,7 +1195,7 @@ static struct btree *btree_node_alloc_replacement(struct btree *b,
{
struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent);
- if (!IS_ERR_OR_NULL(n)) {
+ if (!IS_ERR(n)) {
mutex_lock(&n->write_lock);
bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
bkey_copy_key(&n->key, &b->key);
@@ -1401,7 +1410,7 @@ static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
for (i = 0; i < nodes; i++) {
new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL);
- if (IS_ERR_OR_NULL(new_nodes[i]))
+ if (IS_ERR(new_nodes[i]))
goto out_nocoalesce;
}
@@ -1553,6 +1562,8 @@ static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op,
return 0;
n = btree_node_alloc_replacement(replace, NULL);
+ if (IS_ERR(n))
+ return 0;
/* recheck reserve after allocating replacement node */
if (btree_check_reserve(b, NULL)) {
@@ -1718,7 +1729,7 @@ static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
if (should_rewrite) {
n = btree_node_alloc_replacement(b, NULL);
- if (!IS_ERR_OR_NULL(n)) {
+ if (!IS_ERR(n)) {
bch_btree_node_write_sync(n);
bch_btree_set_root(n);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index efdf6ce0443e..5d1a4eb81669 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -428,7 +428,7 @@ static int __uuid_write(struct cache_set *c)
closure_init_stack(&cl);
lockdep_assert_held(&bch_register_lock);
- if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true))
+ if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, true))
return 1;
SET_KEY_SIZE(&k.key, c->sb.bucket_size);
@@ -822,6 +822,8 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
if (!d->stripe_size)
d->stripe_size = 1 << 31;
+ else if (d->stripe_size < BCH_MIN_STRIPE_SZ)
+ d->stripe_size = roundup(BCH_MIN_STRIPE_SZ, d->stripe_size);
n = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
if (!n || n > max_stripes) {
@@ -1633,7 +1635,7 @@ static void cache_set_flush(struct closure *cl)
if (!IS_ERR_OR_NULL(c->gc_thread))
kthread_stop(c->gc_thread);
- if (!IS_ERR_OR_NULL(c->root))
+ if (!IS_ERR(c->root))
list_add(&c->root->list, &c->btree_cache);
/*
@@ -1906,7 +1908,7 @@ static int run_cache_set(struct cache_set *c)
c->root = bch_btree_node_get(c, NULL, k,
j->btree_level,
true, NULL);
- if (IS_ERR_OR_NULL(c->root))
+ if (IS_ERR(c->root))
goto err;
list_del_init(&c->root->list);
@@ -2000,7 +2002,7 @@ static int run_cache_set(struct cache_set *c)
err = "cannot allocate new btree root";
c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL);
- if (IS_ERR_OR_NULL(c->root))
+ if (IS_ERR(c->root))
goto err;
mutex_lock(&c->root->write_lock);
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 7f0fb4b5755a..30073e4074c1 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -1057,7 +1057,7 @@ SHOW(__bch_cache)
sum += INITIAL_PRIO - cached[i];
if (n)
- do_div(sum, n);
+ sum = div64_u64(sum, n);
for (i = 0; i < ARRAY_SIZE(q); i++)
q[i] = INITIAL_PRIO - cached[n * (i + 1) /
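do_div() takes only a 32-bit divisor, so a divisor wider than 32 bits is silently truncated before the division; div64_u64() performs a full 64-by-64 division. A hedged illustration with made-up values:

    #include <linux/math64.h>

    u64 sum = 1000000000000ULL;
    u64 n   = 5000000000ULL;   /* > UINT_MAX: do_div() would truncate this */

    sum = div64_u64(sum, n);   /* 64-bit dividend, 64-bit divisor */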
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 0b02210ab435..5767ff6c13e3 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -119,27 +119,61 @@ static void __update_writeback_rate(struct cached_dev *dc)
dc->writeback_rate_target = target;
}
+static bool idle_counter_exceeded(struct cache_set *c)
+{
+ int counter, dev_nr;
+
+ /*
+ * If c->idle_counter overflows (idle for a really long time),
+ * reset it to 0 and do not set the maximum rate this time, for
+ * code simplicity.
+ */
+ counter = atomic_inc_return(&c->idle_counter);
+ if (counter <= 0) {
+ atomic_set(&c->idle_counter, 0);
+ return false;
+ }
+
+ dev_nr = atomic_read(&c->attached_dev_nr);
+ if (dev_nr == 0)
+ return false;
+
+ /*
+ * c->idle_counter is increased by the writeback threads of all
+ * attached backing devices; to represent a rough time period,
+ * the counter should be divided by dev_nr. Otherwise the idle
+ * time could not grow as more backing devices are attached.
+ * The following calculation is equivalent to checking
+ * (counter / dev_nr) < (dev_nr * 6)
+ */
+ if (counter < (dev_nr * dev_nr * 6))
+ return false;
+
+ return true;
+}
+
+/*
+ * Idle_counter is increased every time update_writeback_rate() is
+ * called. If all backing devices attached to the same cache set have
+ * identical dc->writeback_rate_update_seconds values, it is about 6
+ * rounds of update_writeback_rate() on each backing device before
+ * c->at_max_writeback_rate is set to 1, and then the max writeback
+ * rate is set for each dc->writeback_rate.rate.
+ * In order to avoid extra locking cost for counting the exact number
+ * of dirty cached devices, c->attached_dev_nr is used to calculate
+ * the idle threshold. It might be bigger if not all cached devices
+ * are in writeback mode, but it still works well with a limited
+ * number of extra rounds of update_writeback_rate().
+ */
static bool set_at_max_writeback_rate(struct cache_set *c,
struct cached_dev *dc)
{
/* Don't set max writeback rate if gc is running */
if (!c->gc_mark_valid)
return false;
- /*
- * Idle_counter is increased everytime when update_writeback_rate() is
- * called. If all backing devices attached to the same cache set have
- * identical dc->writeback_rate_update_seconds values, it is about 6
- * rounds of update_writeback_rate() on each backing device before
- * c->at_max_writeback_rate is set to 1, and then max wrteback rate set
- * to each dc->writeback_rate.rate.
- * In order to avoid extra locking cost for counting exact dirty cached
- * devices number, c->attached_dev_nr is used to calculate the idle
- * throushold. It might be bigger if not all cached device are in write-
- * back mode, but it still works well with limited extra rounds of
- * update_writeback_rate().
- */
- if (atomic_inc_return(&c->idle_counter) <
- atomic_read(&c->attached_dev_nr) * 6)
+
+ if (!idle_counter_exceeded(c))
return false;
if (atomic_read(&c->at_max_writeback_rate) != 1)
@@ -153,13 +187,10 @@ static bool set_at_max_writeback_rate(struct cache_set *c,
dc->writeback_rate_change = 0;
/*
- * Check c->idle_counter and c->at_max_writeback_rate agagain in case
- * new I/O arrives during before set_at_max_writeback_rate() returns.
- * Then the writeback rate is set to 1, and its new value should be
- * decided via __update_writeback_rate().
+ * In case new I/O arrives before
+ * set_at_max_writeback_rate() returns.
*/
- if ((atomic_read(&c->idle_counter) <
- atomic_read(&c->attached_dev_nr) * 6) ||
+ if (!idle_counter_exceeded(c) ||
!atomic_read(&c->at_max_writeback_rate))
return false;
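A worked example of the new helper's threshold: with dev_nr = 4 attached backing devices, idle_counter_exceeded() returns true once the shared counter reaches dev_nr * dev_nr * 6 = 96 increments, i.e. roughly 24 update_writeback_rate() rounds per device, which is the division-free equivalent of checking (counter / dev_nr) < (dev_nr * 6).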
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index af6d4f898e4c..2ecd0db0f294 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -551,11 +551,13 @@ static int __create_persistent_data_objects(struct dm_cache_metadata *cmd,
return r;
}
-static void __destroy_persistent_data_objects(struct dm_cache_metadata *cmd)
+static void __destroy_persistent_data_objects(struct dm_cache_metadata *cmd,
+ bool destroy_bm)
{
dm_sm_destroy(cmd->metadata_sm);
dm_tm_destroy(cmd->tm);
- dm_block_manager_destroy(cmd->bm);
+ if (destroy_bm)
+ dm_block_manager_destroy(cmd->bm);
}
typedef unsigned long (*flags_mutator)(unsigned long);
@@ -826,7 +828,7 @@ static struct dm_cache_metadata *lookup_or_open(struct block_device *bdev,
cmd2 = lookup(bdev);
if (cmd2) {
mutex_unlock(&table_lock);
- __destroy_persistent_data_objects(cmd);
+ __destroy_persistent_data_objects(cmd, true);
kfree(cmd);
return cmd2;
}
@@ -874,7 +876,7 @@ void dm_cache_metadata_close(struct dm_cache_metadata *cmd)
mutex_unlock(&table_lock);
if (!cmd->fail_io)
- __destroy_persistent_data_objects(cmd);
+ __destroy_persistent_data_objects(cmd, true);
kfree(cmd);
}
}
@@ -1808,14 +1810,52 @@ int dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd, bool *result)
int dm_cache_metadata_abort(struct dm_cache_metadata *cmd)
{
- int r;
+ int r = -EINVAL;
+ struct dm_block_manager *old_bm = NULL, *new_bm = NULL;
+
+ /* fail_io is double-checked with cmd->root_lock held below */
+ if (unlikely(cmd->fail_io))
+ return r;
+
+ /*
+ * Replacement block manager (new_bm) is created and old_bm destroyed outside of
+ * cmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of
+ * shrinker associated with the block manager's bufio client vs cmd root_lock).
+ * - must take shrinker_rwsem without holding cmd->root_lock
+ */
+ new_bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
+ CACHE_MAX_CONCURRENT_LOCKS);
WRITE_LOCK(cmd);
- __destroy_persistent_data_objects(cmd);
- r = __create_persistent_data_objects(cmd, false);
+ if (cmd->fail_io) {
+ WRITE_UNLOCK(cmd);
+ goto out;
+ }
+
+ __destroy_persistent_data_objects(cmd, false);
+ old_bm = cmd->bm;
+ if (IS_ERR(new_bm)) {
+ DMERR("could not create block manager during abort");
+ cmd->bm = NULL;
+ r = PTR_ERR(new_bm);
+ goto out_unlock;
+ }
+
+ cmd->bm = new_bm;
+ r = __open_or_format_metadata(cmd, false);
+ if (r) {
+ cmd->bm = NULL;
+ goto out_unlock;
+ }
+ new_bm = NULL;
+out_unlock:
if (r)
cmd->fail_io = true;
WRITE_UNLOCK(cmd);
+ dm_block_manager_destroy(old_bm);
+out:
+ if (new_bm && !IS_ERR(new_bm))
+ dm_block_manager_destroy(new_bm);
return r;
}
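The abort path above is an instance of a swap-outside-the-lock pattern: allocate the replacement resource before taking the lock, swap the pointer while holding it, and destroy the old resource only after dropping it. A generic sketch of the pattern (names hypothetical, not dm API):

    new = create_resource();                 /* may take shrinker_rwsem */
    lock(&root_lock);
    old = obj->res;
    obj->res = IS_ERR(new) ? NULL : new;
    unlock(&root_lock);
    destroy_resource(old);                   /* again outside root_lock */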
diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
index b61aac00ff40..859073193f5b 100644
--- a/drivers/md/dm-cache-policy-smq.c
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -854,7 +854,13 @@ struct smq_policy {
struct background_tracker *bg_work;
- bool migrations_allowed;
+ bool migrations_allowed:1;
+
+ /*
+ * If this is set the policy will try and clean the whole cache
+ * even if the device is not idle.
+ */
+ bool cleaner:1;
};
/*----------------------------------------------------------------*/
@@ -1133,7 +1139,7 @@ static bool clean_target_met(struct smq_policy *mq, bool idle)
* Cache entries may not be populated. So we cannot rely on the
* size of the clean queue.
*/
- if (idle) {
+ if (idle || mq->cleaner) {
/*
* We'd like to clean everything.
*/
@@ -1716,11 +1722,9 @@ static void calc_hotspot_params(sector_t origin_size,
*hotspot_block_size /= 2u;
}
-static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size,
- sector_t origin_size,
- sector_t cache_block_size,
- bool mimic_mq,
- bool migrations_allowed)
+static struct dm_cache_policy *
+__smq_create(dm_cblock_t cache_size, sector_t origin_size, sector_t cache_block_size,
+ bool mimic_mq, bool migrations_allowed, bool cleaner)
{
unsigned i;
unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS;
@@ -1807,6 +1811,7 @@ static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size,
goto bad_btracker;
mq->migrations_allowed = migrations_allowed;
+ mq->cleaner = cleaner;
return &mq->policy;
@@ -1830,21 +1835,24 @@ static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
sector_t origin_size,
sector_t cache_block_size)
{
- return __smq_create(cache_size, origin_size, cache_block_size, false, true);
+ return __smq_create(cache_size, origin_size, cache_block_size,
+ false, true, false);
}
static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
sector_t origin_size,
sector_t cache_block_size)
{
- return __smq_create(cache_size, origin_size, cache_block_size, true, true);
+ return __smq_create(cache_size, origin_size, cache_block_size,
+ true, true, false);
}
static struct dm_cache_policy *cleaner_create(dm_cblock_t cache_size,
sector_t origin_size,
sector_t cache_block_size)
{
- return __smq_create(cache_size, origin_size, cache_block_size, false, false);
+ return __smq_create(cache_size, origin_size, cache_block_size,
+ false, false, true);
}
/*----------------------------------------------------------------*/
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index f595e9867cbe..08b5e44df324 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -1011,16 +1011,16 @@ static void abort_transaction(struct cache *cache)
if (get_cache_mode(cache) >= CM_READ_ONLY)
return;
- if (dm_cache_metadata_set_needs_check(cache->cmd)) {
- DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
- set_cache_mode(cache, CM_FAIL);
- }
-
DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
if (dm_cache_metadata_abort(cache->cmd)) {
DMERR("%s: failed to abort metadata transaction", dev_name);
set_cache_mode(cache, CM_FAIL);
}
+
+ if (dm_cache_metadata_set_needs_check(cache->cmd)) {
+ DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
+ set_cache_mode(cache, CM_FAIL);
+ }
}
static void metadata_operation_failed(struct cache *cache, const char *op, int r)
@@ -1910,6 +1910,7 @@ static void process_deferred_bios(struct work_struct *ws)
else
commit_needed = process_bio(cache, bio) || commit_needed;
+ cond_resched();
}
if (commit_needed)
@@ -1932,6 +1933,7 @@ static void requeue_deferred_bios(struct cache *cache)
while ((bio = bio_list_pop(&bios))) {
bio->bi_status = BLK_STS_DM_REQUEUE;
bio_endio(bio);
+ cond_resched();
}
}
@@ -1972,6 +1974,8 @@ static void check_migrations(struct work_struct *ws)
r = mg_start(cache, op, NULL);
if (r)
break;
+
+ cond_resched();
}
}
@@ -1992,6 +1996,7 @@ static void destroy(struct cache *cache)
if (cache->prison)
dm_bio_prison_destroy_v2(cache->prison);
+ cancel_delayed_work_sync(&cache->waker);
if (cache->wq)
destroy_workqueue(cache->wq);
diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c
index eb7a5d3ba81a..355afca2969d 100644
--- a/drivers/md/dm-clone-target.c
+++ b/drivers/md/dm-clone-target.c
@@ -1977,6 +1977,7 @@ static void clone_dtr(struct dm_target *ti)
mempool_exit(&clone->hydration_pool);
dm_kcopyd_client_destroy(clone->kcopyd_client);
+ cancel_delayed_work_sync(&clone->waker);
destroy_workqueue(clone->wq);
hash_table_exit(clone);
dm_clone_metadata_close(clone->cmd);
@@ -2231,6 +2232,7 @@ static int __init dm_clone_init(void)
r = dm_register_target(&clone_target);
if (r < 0) {
DMERR("Failed to register clone target");
+ kmem_cache_destroy(_hydration_cache);
return r;
}
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 3fea121fcbcf..e3c3bbe92677 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -18,6 +18,8 @@
#include "dm.h"
#define DM_RESERVED_MAX_IOS 1024
+#define DM_MAX_TARGETS 1048576
+#define DM_MAX_TARGET_PARAMS 1024
struct dm_kobject_holder {
struct kobject kobj;
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index fa674e9b6f23..6866aa2aade3 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -46,11 +46,11 @@
struct convert_context {
struct completion restart;
struct bio *bio_in;
- struct bio *bio_out;
struct bvec_iter iter_in;
+ struct bio *bio_out;
struct bvec_iter iter_out;
- u64 cc_sector;
atomic_t cc_pending;
+ u64 cc_sector;
union {
struct skcipher_request *req;
struct aead_request *req_aead;
@@ -1554,6 +1554,7 @@ pop_from_list:
io = crypt_io_from_node(rb_first(&write_tree));
rb_erase(&io->rb_node, &write_tree);
kcryptd_io_write(io);
+ cond_resched();
} while (!RB_EMPTY_ROOT(&write_tree));
blk_finish_plug(&plug);
}
@@ -1626,6 +1627,12 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
io->ctx.bio_out = clone;
io->ctx.iter_out = clone->bi_iter;
+ if (crypt_integrity_aead(cc)) {
+ bio_copy_data(clone, io->base_bio);
+ io->ctx.bio_in = clone;
+ io->ctx.iter_in = clone->bi_iter;
+ }
+
sector += bio_sectors(clone);
crypt_inc_pending(io);
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index f496213f8b67..7c0e7c662e07 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -30,7 +30,7 @@ struct delay_c {
struct workqueue_struct *kdelayd_wq;
struct work_struct flush_expired_bios;
struct list_head delayed_bios;
- atomic_t may_delay;
+ bool may_delay;
struct delay_class read;
struct delay_class write;
@@ -191,7 +191,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
INIT_WORK(&dc->flush_expired_bios, flush_expired_bios);
INIT_LIST_HEAD(&dc->delayed_bios);
mutex_init(&dc->timer_lock);
- atomic_set(&dc->may_delay, 1);
+ dc->may_delay = true;
dc->argc = argc;
ret = delay_class_ctr(ti, &dc->read, argv);
@@ -245,7 +245,7 @@ static int delay_bio(struct delay_c *dc, struct delay_class *c, struct bio *bio)
struct dm_delay_info *delayed;
unsigned long expires = 0;
- if (!c->delay || !atomic_read(&dc->may_delay))
+ if (!c->delay)
return DM_MAPIO_REMAPPED;
delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info));
@@ -254,6 +254,10 @@ static int delay_bio(struct delay_c *dc, struct delay_class *c, struct bio *bio)
delayed->expires = expires = jiffies + msecs_to_jiffies(c->delay);
mutex_lock(&delayed_bios_lock);
+ if (unlikely(!dc->may_delay)) {
+ mutex_unlock(&delayed_bios_lock);
+ return DM_MAPIO_REMAPPED;
+ }
c->ops++;
list_add_tail(&delayed->list, &dc->delayed_bios);
mutex_unlock(&delayed_bios_lock);
@@ -267,7 +271,10 @@ static void delay_presuspend(struct dm_target *ti)
{
struct delay_c *dc = ti->private;
- atomic_set(&dc->may_delay, 0);
+ mutex_lock(&delayed_bios_lock);
+ dc->may_delay = false;
+ mutex_unlock(&delayed_bios_lock);
+
del_timer_sync(&dc->delay_timer);
flush_bios(flush_delayed_bios(dc, 1));
}
@@ -276,7 +283,7 @@ static void delay_resume(struct dm_target *ti)
{
struct delay_c *dc = ti->private;
- atomic_set(&dc->may_delay, 1);
+ dc->may_delay = true;
}
static int delay_map(struct dm_target *ti, struct bio *bio)
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index 2900fbde89b3..967d4ad6d32c 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -124,9 +124,9 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
* Direction r or w?
*/
arg_name = dm_shift_arg(as);
- if (!strcasecmp(arg_name, "w"))
+ if (arg_name && !strcasecmp(arg_name, "w"))
fc->corrupt_bio_rw = WRITE;
- else if (!strcasecmp(arg_name, "r"))
+ else if (arg_name && !strcasecmp(arg_name, "r"))
fc->corrupt_bio_rw = READ;
else {
ti->error = "Invalid corrupt bio direction (r or w)";
@@ -301,8 +301,11 @@ static void corrupt_bio_data(struct bio *bio, struct flakey_c *fc)
*/
bio_for_each_segment(bvec, bio, iter) {
if (bio_iter_len(bio, iter) > corrupt_bio_byte) {
- char *segment = (page_address(bio_iter_page(bio, iter))
- + bio_iter_offset(bio, iter));
+ char *segment;
+ struct page *page = bio_iter_page(bio, iter);
+ if (unlikely(page == ZERO_PAGE(0)))
+ break;
+ segment = (page_address(page) + bio_iter_offset(bio, iter));
segment[corrupt_bio_byte] = fc->corrupt_bio_value;
DMDEBUG("Corrupting data bio=%p by writing %u to byte %u "
"(rw=%c bi_opf=%u bi_sector=%llu size=%u)\n",
@@ -360,9 +363,11 @@ static int flakey_map(struct dm_target *ti, struct bio *bio)
/*
* Corrupt matching writes.
*/
- if (fc->corrupt_bio_byte && (fc->corrupt_bio_rw == WRITE)) {
- if (all_corrupt_bio_flags_match(bio, fc))
- corrupt_bio_data(bio, fc);
+ if (fc->corrupt_bio_byte) {
+ if (fc->corrupt_bio_rw == WRITE) {
+ if (all_corrupt_bio_flags_match(bio, fc))
+ corrupt_bio_data(bio, fc);
+ }
goto map_bio;
}
@@ -388,13 +393,14 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio,
return DM_ENDIO_DONE;
if (!*error && pb->bio_submitted && (bio_data_dir(bio) == READ)) {
- if (fc->corrupt_bio_byte && (fc->corrupt_bio_rw == READ) &&
- all_corrupt_bio_flags_match(bio, fc)) {
- /*
- * Corrupt successful matching READs while in down state.
- */
- corrupt_bio_data(bio, fc);
-
+ if (fc->corrupt_bio_byte) {
+ if ((fc->corrupt_bio_rw == READ) &&
+ all_corrupt_bio_flags_match(bio, fc)) {
+ /*
+ * Corrupt successful matching READs while in down state.
+ */
+ corrupt_bio_data(bio, fc);
+ }
} else if (!test_bit(DROP_WRITES, &fc->flags) &&
!test_bit(ERROR_WRITES, &fc->flags)) {
/*
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index acbda91e7643..4b634633b4a5 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -31,11 +31,11 @@
#define DEFAULT_BUFFER_SECTORS 128
#define DEFAULT_JOURNAL_WATERMARK 50
#define DEFAULT_SYNC_MSEC 10000
-#define DEFAULT_MAX_JOURNAL_SECTORS 131072
+#define DEFAULT_MAX_JOURNAL_SECTORS (IS_ENABLED(CONFIG_64BIT) ? 131072 : 8192)
#define MIN_LOG2_INTERLEAVE_SECTORS 3
#define MAX_LOG2_INTERLEAVE_SECTORS 31
#define METADATA_WORKQUEUE_MAX_ACTIVE 16
-#define RECALC_SECTORS 8192
+#define RECALC_SECTORS (IS_ENABLED(CONFIG_64BIT) ? 32768 : 2048)
#define RECALC_WRITE_SUPER 16
#define BITMAP_BLOCK_SIZE 4096 /* don't change it */
#define BITMAP_FLUSH_INTERVAL (10 * HZ)
@@ -1582,11 +1582,12 @@ static void integrity_metadata(struct work_struct *w)
}
__bio_for_each_segment(bv, bio, iter, dio->bio_details.bi_iter) {
+ struct bio_vec bv_copy = bv;
unsigned pos;
char *mem, *checksums_ptr;
again:
- mem = (char *)kmap_atomic(bv.bv_page) + bv.bv_offset;
+ mem = (char *)kmap_atomic(bv_copy.bv_page) + bv_copy.bv_offset;
pos = 0;
checksums_ptr = checksums;
do {
@@ -1595,7 +1596,7 @@ again:
sectors_to_process -= ic->sectors_per_block;
pos += ic->sectors_per_block << SECTOR_SHIFT;
sector += ic->sectors_per_block;
- } while (pos < bv.bv_len && sectors_to_process && checksums != checksums_onstack);
+ } while (pos < bv_copy.bv_len && sectors_to_process && checksums != checksums_onstack);
kunmap_atomic(mem);
r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset,
@@ -1615,9 +1616,9 @@ again:
if (!sectors_to_process)
break;
- if (unlikely(pos < bv.bv_len)) {
- bv.bv_offset += pos;
- bv.bv_len -= pos;
+ if (unlikely(pos < bv_copy.bv_len)) {
+ bv_copy.bv_offset += pos;
+ bv_copy.bv_len -= pos;
goto again;
}
}
@@ -2346,10 +2347,6 @@ static void integrity_writer(struct work_struct *w)
unsigned prev_free_sectors;
- /* the following test is not needed, but it tests the replay code */
- if (unlikely(dm_post_suspending(ic->ti)) && !ic->meta_dev)
- return;
-
spin_lock_irq(&ic->endio_wait.lock);
write_start = ic->committed_section;
write_sections = ic->n_committed_sections;
@@ -2858,8 +2855,7 @@ static void dm_integrity_postsuspend(struct dm_target *ti)
drain_workqueue(ic->commit_wq);
if (ic->mode == 'J') {
- if (ic->meta_dev)
- queue_work(ic->writer_wq, &ic->writer_work);
+ queue_work(ic->writer_wq, &ic->writer_work);
drain_workqueue(ic->writer_wq);
dm_integrity_flush_buffers(ic, true);
}
@@ -3759,7 +3755,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv)
} else if (sscanf(opt_string, "sectors_per_bit:%llu%c", &llval, &dummy) == 1) {
log2_sectors_per_bitmap_bit = !llval ? 0 : __ilog2_u64(llval);
} else if (sscanf(opt_string, "bitmap_flush_interval:%u%c", &val, &dummy) == 1) {
- if (val >= (uint64_t)UINT_MAX * 1000 / HZ) {
+ if ((uint64_t)val >= (uint64_t)UINT_MAX * 1000 / HZ) {
r = -EINVAL;
ti->error = "Invalid bitmap_flush_interval argument";
goto bad;
@@ -4200,6 +4196,8 @@ static void dm_integrity_dtr(struct dm_target *ti)
BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress));
BUG_ON(!list_empty(&ic->wait_list));
+ if (ic->mode == 'B')
+ cancel_delayed_work_sync(&ic->bitmap_flush_work);
if (ic->metadata_wq)
destroy_workqueue(ic->metadata_wq);
if (ic->wait_wq)
@@ -4291,11 +4289,13 @@ static int __init dm_integrity_init(void)
}
r = dm_register_target(&integrity_target);
-
- if (r < 0)
+ if (r < 0) {
DMERR("register failed %d", r);
+ kmem_cache_destroy(journal_io_cache);
+ return r;
+ }
- return r;
+ return 0;
}
static void __exit dm_integrity_exit(void)
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 81ffc59d05c9..4312007d2d34 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -306,7 +306,7 @@ static void do_region(int op, int op_flags, unsigned region,
struct request_queue *q = bdev_get_queue(where->bdev);
unsigned short logical_block_size = queue_logical_block_size(q);
sector_t num_sectors;
- unsigned int uninitialized_var(special_cmd_max_sectors);
+ unsigned int special_cmd_max_sectors;
/*
* Reject unsupported discard and write same requests.
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 7a73f2fa0ad7..e89e710dd292 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -573,7 +573,7 @@ static void list_version_get_needed(struct target_type *tt, void *needed_param)
size_t *needed = needed_param;
*needed += sizeof(struct dm_target_versions);
- *needed += strlen(tt->name);
+ *needed += strlen(tt->name) + 1;
*needed += ALIGN_MASK;
}
@@ -638,7 +638,7 @@ static int __list_versions(struct dm_ioctl *param, size_t param_size, const char
iter_info.old_vers = NULL;
iter_info.vers = vers;
iter_info.flags = 0;
- iter_info.end = (char *)vers+len;
+ iter_info.end = (char *)vers + needed;
/*
* Now loop through filling out the names & versions.
@@ -1435,11 +1435,12 @@ static int table_clear(struct file *filp, struct dm_ioctl *param, size_t param_s
hc->new_map = NULL;
}
- param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
-
- __dev_status(hc->md, param);
md = hc->md;
up_write(&_hash_lock);
+
+ param->flags &= ~DM_INACTIVE_PRESENT_FLAG;
+ __dev_status(md, param);
+
if (old_map) {
dm_sync_table(md);
dm_table_destroy(old_map);
@@ -1759,7 +1760,8 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kern
if (copy_from_user(param_kernel, user, minimum_data_size))
return -EFAULT;
- if (param_kernel->data_size < minimum_data_size)
+ if (unlikely(param_kernel->data_size < minimum_data_size) ||
+ unlikely(param_kernel->data_size > DM_MAX_TARGETS * DM_MAX_TARGET_PARAMS))
return -EINVAL;
secure_data = param_kernel->flags & DM_SECURE_DATA_FLAG;
@@ -1847,7 +1849,7 @@ static int ctl_ioctl(struct file *file, uint command, struct dm_ioctl __user *us
int ioctl_flags;
int param_flags;
unsigned int cmd;
- struct dm_ioctl *uninitialized_var(param);
+ struct dm_ioctl *param;
ioctl_fn fn = NULL;
size_t input_param_size;
struct dm_ioctl param_kernel;
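With DM_MAX_TARGETS = 1048576 and DM_MAX_TARGET_PARAMS = 1024 from the dm-core.h hunk above, the new bound on data_size works out to 2^20 * 2^10 = 2^30 bytes, i.e. a 1 GiB ceiling on the buffer a malformed ioctl can ask the kernel to allocate.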
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 42151c9fc6e5..25eecb92f5f3 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -3284,15 +3284,19 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
/* Try to adjust the raid4/5/6 stripe cache size to the stripe size */
if (rs_is_raid456(rs)) {
r = rs_set_raid456_stripe_cache(rs);
- if (r)
+ if (r) {
+ mddev_unlock(&rs->md);
goto bad_stripe_cache;
+ }
}
/* Now do an early reshape check */
if (test_bit(RT_FLAG_RESHAPE_RS, &rs->runtime_flags)) {
r = rs_check_reshape(rs);
- if (r)
+ if (r) {
+ mddev_unlock(&rs->md);
goto bad_check_reshape;
+ }
/* Restore new, ctr requested layout to perform check */
rs_config_restore(rs, &rs_layout);
@@ -3301,6 +3305,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
r = rs->md.pers->check_reshape(&rs->md);
if (r) {
ti->error = "Reshape check failed";
+ mddev_unlock(&rs->md);
goto bad_check_reshape;
}
}
@@ -3338,14 +3343,14 @@ static int raid_map(struct dm_target *ti, struct bio *bio)
struct mddev *mddev = &rs->md;
/*
- * If we're reshaping to add disk(s)), ti->len and
+ * If we're reshaping to add disk(s), ti->len and
* mddev->array_sectors will differ during the process
* (ti->len > mddev->array_sectors), so we have to requeue
* bios with addresses > mddev->array_sectors here or
* there will occur accesses past EOD of the component
* data images thus erroring the raid set.
*/
- if (unlikely(bio_end_sector(bio) > mddev->array_sectors))
+ if (unlikely(bio_has_data(bio) && bio_end_sector(bio) > mddev->array_sectors))
return DM_MAPIO_REQUEUE;
md_handle_request(mddev, bio);
@@ -3528,7 +3533,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
{
struct raid_set *rs = ti->private;
struct mddev *mddev = &rs->md;
- struct r5conf *conf = mddev->private;
+ struct r5conf *conf = rs_is_raid456(rs) ? mddev->private : NULL;
int i, max_nr_stripes = conf ? conf->max_nr_stripes : 0;
unsigned long recovery;
unsigned int raid_param_cnt = 1; /* at least 1 for chunksize */
@@ -3808,7 +3813,7 @@ static void attempt_restore_of_faulty_devices(struct raid_set *rs)
memset(cleared_failed_devices, 0, sizeof(cleared_failed_devices));
- for (i = 0; i < mddev->raid_disks; i++) {
+ for (i = 0; i < rs->raid_disks; i++) {
r = &rs->dev[i].rdev;
/* HM FIXME: enhance journal device recovery processing */
if (test_bit(Journal, &r->flags))
@@ -4022,7 +4027,9 @@ static void raid_resume(struct dm_target *ti)
* Take this opportunity to check whether any failed
* devices are reachable again.
*/
+ mddev_lock_nointr(mddev);
attempt_restore_of_faulty_devices(rs);
+ mddev_unlock(mddev);
}
if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) {
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 963d3774c93e..247089c2be25 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -613,7 +613,7 @@ static int persistent_read_metadata(struct dm_exception_store *store,
chunk_t old, chunk_t new),
void *callback_context)
{
- int r, uninitialized_var(new_snapshot);
+ int r, new_snapshot;
struct pstore *ps = get_info(store);
/*
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index e902aae685af..d8902d2b6aa6 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -685,8 +685,10 @@ static void dm_exception_table_exit(struct dm_exception_table *et,
for (i = 0; i < size; i++) {
slot = et->table + i;
- hlist_bl_for_each_entry_safe(ex, pos, n, slot, hash_list)
+ hlist_bl_for_each_entry_safe(ex, pos, n, slot, hash_list) {
kmem_cache_free(mem, ex);
+ cond_resched();
+ }
}
vfree(et->table);
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index ce6d3bce1b7b..0f42f4a7f382 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -188,7 +188,7 @@ static int dm_stat_in_flight(struct dm_stat_shared *shared)
atomic_read(&shared->in_flight[WRITE]);
}
-void dm_stats_init(struct dm_stats *stats)
+int dm_stats_init(struct dm_stats *stats)
{
int cpu;
struct dm_stats_last_position *last;
@@ -196,11 +196,16 @@ void dm_stats_init(struct dm_stats *stats)
mutex_init(&stats->mutex);
INIT_LIST_HEAD(&stats->list);
stats->last = alloc_percpu(struct dm_stats_last_position);
+ if (!stats->last)
+ return -ENOMEM;
+
for_each_possible_cpu(cpu) {
last = per_cpu_ptr(stats->last, cpu);
last->last_sector = (sector_t)ULLONG_MAX;
last->last_rw = UINT_MAX;
}
+
+ return 0;
}
void dm_stats_cleanup(struct dm_stats *stats)
diff --git a/drivers/md/dm-stats.h b/drivers/md/dm-stats.h
index 2ddfae678f32..dcac11fce03b 100644
--- a/drivers/md/dm-stats.h
+++ b/drivers/md/dm-stats.h
@@ -22,7 +22,7 @@ struct dm_stats_aux {
unsigned long long duration_ns;
};
-void dm_stats_init(struct dm_stats *st);
+int dm_stats_init(struct dm_stats *st);
void dm_stats_cleanup(struct dm_stats *st);
struct mapped_device;
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 81bc36a43b32..fcb9e2775f78 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -184,7 +184,12 @@ static int alloc_targets(struct dm_table *t, unsigned int num)
int dm_table_create(struct dm_table **result, fmode_t mode,
unsigned num_targets, struct mapped_device *md)
{
- struct dm_table *t = kzalloc(sizeof(*t), GFP_KERNEL);
+ struct dm_table *t;
+
+ if (num_targets > DM_MAX_TARGETS)
+ return -EOVERFLOW;
+
+ t = kzalloc(sizeof(*t), GFP_KERNEL);
if (!t)
return -ENOMEM;
@@ -199,7 +204,7 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
if (!num_targets) {
kfree(t);
- return -ENOMEM;
+ return -EOVERFLOW;
}
if (alloc_targets(t, num_targets)) {
@@ -668,7 +673,7 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table,
*/
unsigned short remaining = 0;
- struct dm_target *uninitialized_var(ti);
+ struct dm_target *ti;
struct queue_limits ti_limits;
unsigned i;
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index a5ed59eafdc5..92e646d597ea 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -701,6 +701,15 @@ static int __open_metadata(struct dm_pool_metadata *pmd)
goto bad_cleanup_data_sm;
}
+ /*
+ * For the pool metadata opening process, setting the root here is
+ * redundant because it will be set again in __begin_transaction().
+ * But the dm pool aborting process really needs the last
+ * transaction's root, to avoid accessing a broken btree.
+ */
+ pmd->root = le64_to_cpu(disk_super->data_mapping_root);
+ pmd->details_root = le64_to_cpu(disk_super->device_details_root);
+
__setup_btree_details(pmd);
dm_bm_unlock(sblock);
@@ -753,13 +762,15 @@ static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool f
return r;
}
-static void __destroy_persistent_data_objects(struct dm_pool_metadata *pmd)
+static void __destroy_persistent_data_objects(struct dm_pool_metadata *pmd,
+ bool destroy_bm)
{
dm_sm_destroy(pmd->data_sm);
dm_sm_destroy(pmd->metadata_sm);
dm_tm_destroy(pmd->nb_tm);
dm_tm_destroy(pmd->tm);
- dm_block_manager_destroy(pmd->bm);
+ if (destroy_bm)
+ dm_block_manager_destroy(pmd->bm);
}
static int __begin_transaction(struct dm_pool_metadata *pmd)
@@ -966,7 +977,7 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
}
pmd_write_unlock(pmd);
if (!pmd->fail_io)
- __destroy_persistent_data_objects(pmd);
+ __destroy_persistent_data_objects(pmd, true);
kfree(pmd);
return 0;
@@ -1875,19 +1886,52 @@ static void __set_abort_with_changes_flags(struct dm_pool_metadata *pmd)
int dm_pool_abort_metadata(struct dm_pool_metadata *pmd)
{
int r = -EINVAL;
+ struct dm_block_manager *old_bm = NULL, *new_bm = NULL;
+
+ /* fail_io is double-checked with pmd->root_lock held below */
+ if (unlikely(pmd->fail_io))
+ return r;
+
+ /*
+ * Replacement block manager (new_bm) is created and old_bm destroyed outside of
+ * pmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of
+ * shrinker associated with the block manager's bufio client vs pmd root_lock).
+ * - must take shrinker_rwsem without holding pmd->root_lock
+ */
+ new_bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
+ THIN_MAX_CONCURRENT_LOCKS);
pmd_write_lock(pmd);
- if (pmd->fail_io)
+ if (pmd->fail_io) {
+ pmd_write_unlock(pmd);
goto out;
+ }
__set_abort_with_changes_flags(pmd);
- __destroy_persistent_data_objects(pmd);
- r = __create_persistent_data_objects(pmd, false);
+ __destroy_persistent_data_objects(pmd, false);
+ old_bm = pmd->bm;
+ if (IS_ERR(new_bm)) {
+ DMERR("could not create block manager during abort");
+ pmd->bm = NULL;
+ r = PTR_ERR(new_bm);
+ goto out_unlock;
+ }
+
+ pmd->bm = new_bm;
+ r = __open_or_format_metadata(pmd, false);
+ if (r) {
+ pmd->bm = NULL;
+ goto out_unlock;
+ }
+ new_bm = NULL;
+out_unlock:
if (r)
pmd->fail_io = true;
-
-out:
pmd_write_unlock(pmd);
+ dm_block_manager_destroy(old_bm);
+out:
+ if (new_bm && !IS_ERR(new_bm))
+ dm_block_manager_destroy(new_bm);
return r;
}
@@ -2060,10 +2104,13 @@ int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
dm_sm_threshold_fn fn,
void *context)
{
- int r;
+ int r = -EINVAL;
pmd_write_lock_in_core(pmd);
- r = dm_sm_register_threshold_callback(pmd->metadata_sm, threshold, fn, context);
+ if (!pmd->fail_io) {
+ r = dm_sm_register_threshold_callback(pmd->metadata_sm,
+ threshold, fn, context);
+ }
pmd_write_unlock(pmd);
return r;
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 1b2c98b43519..a1abdb3467a9 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -2224,6 +2224,7 @@ static void process_thin_deferred_bios(struct thin_c *tc)
throttle_work_update(&pool->throttle);
dm_pool_issue_prefetches(pool->pmd);
}
+ cond_resched();
}
blk_finish_plug(&plug);
}
@@ -2307,6 +2308,7 @@ static void process_thin_deferred_cells(struct thin_c *tc)
else
pool->process_cell(tc, cell);
}
+ cond_resched();
} while (!list_empty(&cells));
}
@@ -2931,6 +2933,8 @@ static void __pool_destroy(struct pool *pool)
dm_bio_prison_destroy(pool->prison);
dm_kcopyd_client_destroy(pool->copier);
+ cancel_delayed_work_sync(&pool->waker);
+ cancel_delayed_work_sync(&pool->no_space_timeout);
if (pool->wq)
destroy_workqueue(pool->wq);
@@ -3403,6 +3407,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
pt->adjusted_pf = pt->requested_pf = pf;
bio_init(&pt->flush_bio, NULL, 0);
ti->num_flush_bios = 1;
+ ti->limit_swap_bios = true;
/*
* Only need to enable discards if the pool should pass
@@ -3425,8 +3430,10 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
calc_metadata_threshold(pt),
metadata_low_callback,
pool);
- if (r)
+ if (r) {
+ ti->error = "Error registering metadata threshold";
goto out_flags_changed;
+ }
pt->callbacks.congested_fn = pool_is_congested;
dm_table_add_target_callbacks(ti->table, &pt->callbacks);
@@ -3589,23 +3596,31 @@ static int pool_preresume(struct dm_target *ti)
*/
r = bind_control_target(pool, ti);
if (r)
- return r;
+ goto out;
dm_pool_register_pre_commit_callback(pool->pmd,
metadata_pre_commit_callback, pt);
r = maybe_resize_data_dev(ti, &need_commit1);
if (r)
- return r;
+ goto out;
r = maybe_resize_metadata_dev(ti, &need_commit2);
if (r)
- return r;
+ goto out;
if (need_commit1 || need_commit2)
(void) commit(pool);
+out:
+ /*
+ * When a thin-pool is PM_FAIL, it cannot be rebuilt if a bio
+ * is stuck on the deferred list. Therefore we need to return 0
+ * to allow pool_resume() to flush IO.
+ */
+ if (r && get_pool_mode(pool) == PM_FAIL)
+ r = 0;
- return 0;
+ return r;
}
static void pool_suspend_active_thins(struct pool *pool)
@@ -4278,6 +4293,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
goto bad;
ti->num_flush_bios = 1;
+ ti->limit_swap_bios = true;
ti->flush_supported = true;
ti->per_io_data_size = sizeof(struct dm_thin_endio_hook);
diff --git a/drivers/md/dm-verity-fec.c b/drivers/md/dm-verity-fec.c
index cea2b3789736..442437e4e03b 100644
--- a/drivers/md/dm-verity-fec.c
+++ b/drivers/md/dm-verity-fec.c
@@ -24,7 +24,8 @@ bool verity_fec_is_enabled(struct dm_verity *v)
*/
static inline struct dm_verity_fec_io *fec_io(struct dm_verity_io *io)
{
- return (struct dm_verity_fec_io *) verity_io_digest_end(io->v, io);
+ return (struct dm_verity_fec_io *)
+ ((char *)io + io->v->ti->per_io_data_size - sizeof(struct dm_verity_fec_io));
}
/*
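The rewritten fec_io() relies directly on the per-bio data layout: struct dm_verity_fec_io sits at the very end of the ti->per_io_data_size region that begins with struct dm_verity_io, so its address is the end of that region minus sizeof(struct dm_verity_fec_io). This replaces the verity_io_digest_end() helper, which the dm-verity.h hunk below removes.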
diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c
index 9dcdf34b7e32..cebc7828ea8b 100644
--- a/drivers/md/dm-verity-target.c
+++ b/drivers/md/dm-verity-target.c
@@ -471,13 +471,14 @@ static int verity_verify_io(struct dm_verity_io *io)
struct bvec_iter start;
unsigned b;
struct crypto_wait wait;
+ struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size);
for (b = 0; b < io->n_blocks; b++) {
int r;
sector_t cur_block = io->block + b;
struct ahash_request *req = verity_io_hash_req(v, io);
- if (v->validated_blocks &&
+ if (v->validated_blocks && bio->bi_status == BLK_STS_OK &&
likely(test_bit(cur_block, v->validated_blocks))) {
verity_bv_skip_block(v, io, &io->iter);
continue;
@@ -525,9 +526,17 @@ static int verity_verify_io(struct dm_verity_io *io)
else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA,
cur_block, NULL, &start) == 0)
continue;
- else if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA,
- cur_block))
- return -EIO;
+ else {
+ if (bio->bi_status) {
+ /*
+ * Error correction failed; just return the error.
+ */
+ return -EIO;
+ }
+ if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA,
+ cur_block))
+ return -EIO;
+ }
}
return 0;
@@ -570,7 +579,9 @@ static void verity_end_io(struct bio *bio)
struct dm_verity_io *io = bio->bi_private;
if (bio->bi_status &&
- (!verity_fec_is_enabled(io->v) || verity_is_system_shutting_down())) {
+ (!verity_fec_is_enabled(io->v) ||
+ verity_is_system_shutting_down() ||
+ (bio->bi_opf & REQ_RAHEAD))) {
verity_finish_io(io, bio->bi_status);
return;
}
diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h
index 641b9e3a399b..fee7c7a81ce4 100644
--- a/drivers/md/dm-verity.h
+++ b/drivers/md/dm-verity.h
@@ -73,11 +73,11 @@ struct dm_verity_io {
/* original value of bio->bi_end_io */
bio_end_io_t *orig_bi_end_io;
+ struct bvec_iter iter;
+
sector_t block;
unsigned n_blocks;
- struct bvec_iter iter;
-
struct work_struct work;
/*
@@ -110,12 +110,6 @@ static inline u8 *verity_io_want_digest(struct dm_verity *v,
return (u8 *)(io + 1) + v->ahash_reqsize + v->digest_size;
}
-static inline u8 *verity_io_digest_end(struct dm_verity *v,
- struct dm_verity_io *io)
-{
- return verity_io_want_digest(v, io) + v->digest_size;
-}
-
extern int verity_for_bv_block(struct dm_verity *v, struct dm_verity_io *io,
struct bvec_iter *iter,
int (*process)(struct dm_verity *v,
diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
index ec10fda3f24f..961498d8f406 100644
--- a/drivers/md/dm-writecache.c
+++ b/drivers/md/dm-writecache.c
@@ -20,7 +20,7 @@
#define HIGH_WATERMARK 50
#define LOW_WATERMARK 45
-#define MAX_WRITEBACK_JOBS 0
+#define MAX_WRITEBACK_JOBS min(0x10000000 / PAGE_SIZE, totalram_pages() / 16)
#define ENDIO_LATENCY 16
#define WRITEBACK_LATENCY 64
#define AUTOCOMMIT_BLOCKS_SSD 65536
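Assuming 4 KiB pages, 0x10000000 / PAGE_SIZE = 65536, so the new MAX_WRITEBACK_JOBS is the smaller of 65536 and one sixteenth of the system's RAM pages; the previous value of 0 is treated by dm-writecache as "no limit".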
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 77e28f77c59f..a7724ba45b43 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -263,7 +263,6 @@ out_uevent_exit:
static void local_exit(void)
{
- flush_scheduled_work();
destroy_workqueue(deferred_remove_workqueue);
unregister_blkdev(_major, _name);
@@ -1995,7 +1994,9 @@ static struct mapped_device *alloc_dev(int minor)
if (!md->bdev)
goto bad;
- dm_stats_init(&md->stats);
+ r = dm_stats_init(&md->stats);
+ if (r < 0)
+ goto bad;
/* Populate the mapping, nobody knows we exist yet */
spin_lock(&_minor_lock);
@@ -2808,6 +2809,9 @@ static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_fla
static void __dm_internal_resume(struct mapped_device *md)
{
+ int r;
+ struct dm_table *map;
+
BUG_ON(!md->internal_suspend_count);
if (--md->internal_suspend_count)
@@ -2816,12 +2820,23 @@ static void __dm_internal_resume(struct mapped_device *md)
if (dm_suspended_md(md))
goto done; /* resume from nested suspend */
- /*
- * NOTE: existing callers don't need to call dm_table_resume_targets
- * (which may fail -- so best to avoid it for now by passing NULL map)
- */
- (void) __dm_resume(md, NULL);
-
+ map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
+ r = __dm_resume(md, map);
+ if (r) {
+ /*
+ * If a preresume method of some target failed, we are in a
+ * tricky situation. We can't return an error to the caller. We
+ * can't fake success because then the "resume" and
+ * "postsuspend" methods would not be paired correctly, and it
+ * would break various targets, for example it would cause list
+ * corruption in the "origin" target.
+ *
+ * So, we fake normal suspend here, to make sure that the
+ * "resume" and "postsuspend" methods will be paired correctly.
+ */
+ DMERR("Preresume method failed: %d", r);
+ set_bit(DMF_SUSPENDED, &md->flags);
+ }
done:
clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
smp_mb__after_atomic();
@@ -3080,6 +3095,11 @@ static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
goto out;
ti = dm_table_get_target(table, 0);
+ if (dm_suspended_md(md)) {
+ ret = -EAGAIN;
+ goto out;
+ }
+
ret = -EINVAL;
if (!ti->type->iterate_devices)
goto out;
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index a95e20c3d0d4..843139447a96 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -54,14 +54,7 @@ __acquires(bitmap->lock)
{
unsigned char *mappage;
- if (page >= bitmap->pages) {
- /* This can happen if bitmap_start_sync goes beyond
- * End-of-device while looking for a whole page.
- * It is harmless.
- */
- return -EINVAL;
- }
-
+ WARN_ON_ONCE(page >= bitmap->pages);
if (bitmap->bp[page].hijacked) /* it's hijacked, don't try to alloc */
return 0;
@@ -489,7 +482,7 @@ void md_bitmap_print_sb(struct bitmap *bitmap)
sb = kmap_atomic(bitmap->storage.sb_page);
pr_debug("%s: bitmap file superblock:\n", bmname(bitmap));
pr_debug(" magic: %08x\n", le32_to_cpu(sb->magic));
- pr_debug(" version: %d\n", le32_to_cpu(sb->version));
+ pr_debug(" version: %u\n", le32_to_cpu(sb->version));
pr_debug(" uuid: %08x.%08x.%08x.%08x\n",
le32_to_cpu(*(__le32 *)(sb->uuid+0)),
le32_to_cpu(*(__le32 *)(sb->uuid+4)),
@@ -500,11 +493,11 @@ void md_bitmap_print_sb(struct bitmap *bitmap)
pr_debug("events cleared: %llu\n",
(unsigned long long) le64_to_cpu(sb->events_cleared));
pr_debug(" state: %08x\n", le32_to_cpu(sb->state));
- pr_debug(" chunksize: %d B\n", le32_to_cpu(sb->chunksize));
- pr_debug(" daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep));
+ pr_debug(" chunksize: %u B\n", le32_to_cpu(sb->chunksize));
+ pr_debug(" daemon sleep: %us\n", le32_to_cpu(sb->daemon_sleep));
pr_debug(" sync size: %llu KB\n",
(unsigned long long)le64_to_cpu(sb->sync_size)/2);
- pr_debug("max write behind: %d\n", le32_to_cpu(sb->write_behind));
+ pr_debug("max write behind: %u\n", le32_to_cpu(sb->write_behind));
kunmap_atomic(sb);
}
@@ -1369,6 +1362,14 @@ __acquires(bitmap->lock)
sector_t csize;
int err;
+ if (page >= bitmap->pages) {
+ /*
+ * This can happen if bitmap_start_sync goes beyond
+ * End-of-device while looking for a whole page, or if the
+ * user set a huge number via sysfs bitmap_set_bits.
+ */
+ return NULL;
+ }
err = md_bitmap_checkpage(bitmap, page, create, 0);
if (bitmap->bp[page].hijacked ||
@@ -2110,7 +2111,8 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
bytes = DIV_ROUND_UP(chunks, 8);
if (!bitmap->mddev->bitmap_info.external)
bytes += sizeof(bitmap_super_t);
- } while (bytes > (space << 9));
+ } while (bytes > (space << 9) && (chunkshift + BITMAP_BLOCK_SHIFT) <
+ (BITS_PER_BYTE * sizeof(((bitmap_super_t *)0)->chunksize) - 1));
} else
chunkshift = ffz(~chunksize) - BITMAP_BLOCK_SHIFT;
@@ -2155,7 +2157,7 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
bitmap->counts.missing_pages = pages;
bitmap->counts.chunkshift = chunkshift;
bitmap->counts.chunks = chunks;
- bitmap->mddev->bitmap_info.chunksize = 1 << (chunkshift +
+ bitmap->mddev->bitmap_info.chunksize = 1UL << (chunkshift +
BITMAP_BLOCK_SHIFT);
blocks = min(old_counts.chunks << old_counts.chunkshift,
@@ -2181,8 +2183,8 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
bitmap->counts.missing_pages = old_counts.pages;
bitmap->counts.chunkshift = old_counts.chunkshift;
bitmap->counts.chunks = old_counts.chunks;
- bitmap->mddev->bitmap_info.chunksize = 1 << (old_counts.chunkshift +
- BITMAP_BLOCK_SHIFT);
+ bitmap->mddev->bitmap_info.chunksize =
+ 1UL << (old_counts.chunkshift + BITMAP_BLOCK_SHIFT);
blocks = old_counts.chunks << old_counts.chunkshift;
pr_warn("Could not pre-allocate in-memory bitmap for cluster raid\n");
break;
@@ -2200,20 +2202,23 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
if (set) {
bmc_new = md_bitmap_get_counter(&bitmap->counts, block, &new_blocks, 1);
- if (*bmc_new == 0) {
- /* need to set on-disk bits too. */
- sector_t end = block + new_blocks;
- sector_t start = block >> chunkshift;
- start <<= chunkshift;
- while (start < end) {
- md_bitmap_file_set_bit(bitmap, block);
- start += 1 << chunkshift;
+ if (bmc_new) {
+ if (*bmc_new == 0) {
+ /* need to set on-disk bits too. */
+ sector_t end = block + new_blocks;
+ sector_t start = block >> chunkshift;
+
+ start <<= chunkshift;
+ while (start < end) {
+ md_bitmap_file_set_bit(bitmap, block);
+ start += 1 << chunkshift;
+ }
+ *bmc_new = 2;
+ md_bitmap_count_page(&bitmap->counts, block, 1);
+ md_bitmap_set_pending(&bitmap->counts, block);
}
- *bmc_new = 2;
- md_bitmap_count_page(&bitmap->counts, block, 1);
- md_bitmap_set_pending(&bitmap->counts, block);
+ *bmc_new |= NEEDED_MASK;
}
- *bmc_new |= NEEDED_MASK;
if (new_blocks < old_blocks)
old_blocks = new_blocks;
}
@@ -2475,11 +2480,35 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len)
{
unsigned long backlog;
unsigned long old_mwb = mddev->bitmap_info.max_write_behind;
+ struct md_rdev *rdev;
+ bool has_write_mostly = false;
int rv = kstrtoul(buf, 10, &backlog);
if (rv)
return rv;
if (backlog > COUNTER_MAX)
return -EINVAL;
+
+ rv = mddev_lock(mddev);
+ if (rv)
+ return rv;
+
+ /*
+ * Without a write-mostly device, it doesn't make sense to set
+ * a backlog for max_write_behind.
+ */
+ rdev_for_each(rdev, mddev) {
+ if (test_bit(WriteMostly, &rdev->flags)) {
+ has_write_mostly = true;
+ break;
+ }
+ }
+ if (!has_write_mostly) {
+ pr_warn_ratelimited("%s: can't set backlog, no write mostly device available\n",
+ mdname(mddev));
+ mddev_unlock(mddev);
+ return -EINVAL;
+ }
+
mddev->bitmap_info.max_write_behind = backlog;
if (!backlog && mddev->wb_info_pool) {
/* wb_info_pool is not needed if backlog is zero */
@@ -2494,6 +2523,8 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len)
}
if (old_mwb != backlog)
md_bitmap_update_sb(mddev->bitmap);
+
+ mddev_unlock(mddev);
return len;
}
@@ -2520,6 +2551,9 @@ chunksize_store(struct mddev *mddev, const char *buf, size_t len)
if (csize < 512 ||
!is_power_of_2(csize))
return -EINVAL;
+ if (BITS_PER_LONG > 32 && csize >= (1ULL << (BITS_PER_BYTE *
+ sizeof(((bitmap_super_t *)0)->chunksize))))
+ return -EOVERFLOW;
mddev->bitmap_info.chunksize = csize;
return len;
}
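For reference, the on-disk bitmap_super_t.chunksize field is 32 bits wide, so the new guard rejects csize values of 2^32 and above (reachable only where unsigned long is 64-bit, hence the BITS_PER_LONG > 32 test) before they would be silently truncated on write-out.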
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 11fd3b32b562..61c3e8df1b55 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -481,13 +481,14 @@ static void md_end_flush(struct bio *bio)
struct md_rdev *rdev = bio->bi_private;
struct mddev *mddev = rdev->mddev;
+ bio_put(bio);
+
rdev_dec_pending(rdev, mddev);
if (atomic_dec_and_test(&mddev->flush_pending)) {
/* The pre-request flush has finished */
queue_work(md_wq, &mddev->flush_work);
}
- bio_put(bio);
}
static void md_submit_flush_data(struct work_struct *ws);
@@ -885,10 +886,12 @@ static void super_written(struct bio *bio)
} else
clear_bit(LastDev, &rdev->flags);
+ bio_put(bio);
+
+ rdev_dec_pending(rdev, mddev);
+
if (atomic_dec_and_test(&mddev->pending_writes))
wake_up(&mddev->sb_wait);
- rdev_dec_pending(rdev, mddev);
- bio_put(bio);
}
void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
@@ -1095,6 +1098,7 @@ struct super_type {
struct md_rdev *refdev,
int minor_version);
int (*validate_super)(struct mddev *mddev,
+ struct md_rdev *freshest,
struct md_rdev *rdev);
void (*sync_super)(struct mddev *mddev,
struct md_rdev *rdev);
@@ -1233,8 +1237,9 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
/*
* validate_super for 0.90.0
+ * note: we are not using "freshest" for the 0.9 superblock
*/
-static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
+static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev)
{
mdp_disk_t *desc;
mdp_super_t *sb = page_address(rdev->sb_page);
@@ -1749,7 +1754,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
return ret;
}
-static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
+static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev)
{
struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
__u64 ev1 = le64_to_cpu(sb->events);
@@ -1845,13 +1850,15 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
}
} else if (mddev->pers == NULL) {
/* Insist on a good event counter while assembling, except for
- * spares (which don't need an event count) */
- ++ev1;
+ * spares (which don't need an event count).
+ * Similar to mdadm, we allow event counter difference of 1
+ * from the freshest device.
+ */
if (rdev->desc_nr >= 0 &&
rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
(le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
- if (ev1 < mddev->events)
+ if (ev1 + 1 < mddev->events)
return -EINVAL;
} else if (mddev->bitmap) {
/* If adding to array with a bitmap, then we can accept an
@@ -1872,8 +1879,38 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
role = MD_DISK_ROLE_SPARE;
rdev->desc_nr = -1;
- } else
+ } else if (mddev->pers == NULL && freshest && ev1 < mddev->events) {
+ /*
+ * If we are assembling, and our event counter is smaller than the
+ * highest event counter, we cannot trust our superblock about the role.
+ * It could happen that our rdev was marked as Faulty, and all other
+ * superblocks were updated with +1 event counter.
+ * Then, before the next superblock update, which typically happens when
+ * remove_and_add_spares() removes the device from the array, there was
+ * a crash or reboot.
+ * If we allow current rdev without consulting the freshest superblock,
+ * we could cause data corruption.
+ * Note that in this case our event counter is smaller by 1 than the
+	 * highest; otherwise, this rdev would not be allowed into the array;
+ * both kernel and mdadm allow event counter difference of 1.
+ */
+ struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page);
+ u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev);
+
+ if (rdev->desc_nr >= freshest_max_dev) {
+ /* this is unexpected, better not proceed */
+ pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n",
+ mdname(mddev), rdev->bdev, rdev->desc_nr,
+ freshest->bdev, freshest_max_dev);
+ return -EUCLEAN;
+ }
+
+ role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]);
+ pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n",
+ mdname(mddev), rdev->bdev, role, role, freshest->bdev);
+ } else {
role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
+ }
switch(role) {
case MD_DISK_ROLE_SPARE: /* spare */
break;
@@ -2777,7 +2814,7 @@ static int add_bound_rdev(struct md_rdev *rdev)
* and should be added immediately.
*/
super_types[mddev->major_version].
- validate_super(mddev, rdev);
+ validate_super(mddev, NULL/*freshest*/, rdev);
if (add_journal)
mddev_suspend(mddev);
err = mddev->pers->hot_add_disk(mddev, rdev);
@@ -3079,6 +3116,9 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
err = kstrtouint(buf, 10, (unsigned int *)&slot);
if (err < 0)
return err;
+ if (slot < 0)
+ /* overflow */
+ return -ENOSPC;
}
if (rdev->mddev->pers && slot == -1) {
/* Setting 'slot' on an active array requires also
@@ -3684,7 +3724,7 @@ static int analyze_sbs(struct mddev *mddev)
}
super_types[mddev->major_version].
- validate_super(mddev, freshest);
+ validate_super(mddev, NULL/*freshest*/, freshest);
i = 0;
rdev_for_each_safe(rdev, tmp, mddev) {
@@ -3699,7 +3739,7 @@ static int analyze_sbs(struct mddev *mddev)
}
if (rdev != freshest) {
if (super_types[mddev->major_version].
- validate_super(mddev, rdev)) {
+ validate_super(mddev, freshest, rdev)) {
pr_warn("md: kicking non-fresh %s from array!\n",
bdevname(rdev->bdev,b));
md_kick_rdev_from_array(rdev);
@@ -3760,8 +3800,9 @@ int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
static ssize_t
safe_delay_show(struct mddev *mddev, char *page)
{
- int msec = (mddev->safemode_delay*1000)/HZ;
- return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
+ unsigned int msec = ((unsigned long)mddev->safemode_delay*1000)/HZ;
+
+ return sprintf(page, "%u.%03u\n", msec/1000, msec%1000);
}
static ssize_t
safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
@@ -3773,7 +3814,7 @@ safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
return -EINVAL;
}
- if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
+ if (strict_strtoul_scaled(cbuf, &msec, 3) < 0 || msec > UINT_MAX / HZ)
return -EINVAL;
if (msec == 0)
mddev->safemode_delay = 0;
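
The added bound keeps the later msec-to-jiffies conversion from overflowing: the store path scales msec by HZ, so any msec above UINT_MAX / HZ would wrap. A small sketch of the guard, assuming HZ is 1000 (the exact limit depends on CONFIG_HZ):

#include <limits.h>
#include <stdio.h>

#define HZ 1000	/* assumption: one common CONFIG_HZ value */

int main(void)
{
	unsigned long msec = (unsigned long)UINT_MAX / HZ + 1;

	/* Without the bound, msec * HZ would exceed UINT_MAX and wrap,
	 * silently storing a tiny delay instead of failing. */
	if (msec > UINT_MAX / HZ)
		printf("rejected: %lu ms would overflow msec * HZ\n", msec);
	return 0;
}
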
@@ -4434,6 +4475,8 @@ max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len
rv = kstrtouint(buf, 10, &n);
if (rv < 0)
return rv;
+ if (n > INT_MAX)
+ return -EINVAL;
atomic_set(&mddev->max_corr_read_errors, n);
return len;
}
@@ -4734,11 +4777,21 @@ action_store(struct mddev *mddev, const char *page, size_t len)
return -EINVAL;
err = mddev_lock(mddev);
if (!err) {
- if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
err = -EBUSY;
- else {
+ } else if (mddev->reshape_position == MaxSector ||
+ mddev->pers->check_reshape == NULL ||
+ mddev->pers->check_reshape(mddev)) {
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
err = mddev->pers->start_reshape(mddev);
+ } else {
+ /*
+ * If reshape is still in progress, and
+ * md_check_recovery() can continue to reshape,
+ * don't restart reshape because data can be
+ * corrupted for raid456.
+ */
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
}
mddev_unlock(mddev);
}
@@ -6094,6 +6147,7 @@ void md_stop(struct mddev *mddev)
/* stop the array and free any attached data structures.
* This is called from dm-raid
*/
+ __md_stop_writes(mddev);
__md_stop(mddev);
bioset_exit(&mddev->bio_set);
bioset_exit(&mddev->sync_set);
@@ -6590,7 +6644,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
rdev->saved_raid_disk = rdev->raid_disk;
} else
super_types[mddev->major_version].
- validate_super(mddev, rdev);
+ validate_super(mddev, NULL/*freshest*/, rdev);
if ((info->state & (1<<MD_DISK_SYNC)) &&
rdev->raid_disk != info->raid_disk) {
/* This was a hot-add request, but events doesn't
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 0ead5a7887f1..7f80e86459b1 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -63,8 +63,8 @@ static void dump_zones(struct mddev *mddev)
int len = 0;
for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
- len += snprintf(line+len, 200-len, "%s%s", k?"/":"",
- bdevname(conf->devlist[j*raid_disks
+ len += scnprintf(line+len, 200-len, "%s%s", k?"/":"",
+ bdevname(conf->devlist[j*raid_disks
+ k]->bdev, b));
pr_debug("md: zone%d=[%s]\n", j, line);
@@ -289,6 +289,18 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
goto abort;
}
+ if (conf->layout == RAID0_ORIG_LAYOUT) {
+ for (i = 1; i < conf->nr_strip_zones; i++) {
+ sector_t first_sector = conf->strip_zone[i-1].zone_end;
+
+ sector_div(first_sector, mddev->chunk_sectors);
+ zone = conf->strip_zone + i;
+			/* disk_shift is the first disk index used in the zone */
+ zone->disk_shift = sector_div(first_sector,
+ zone->nb_dev);
+ }
+ }
+
pr_debug("md/raid0:%s: done.\n", mdname(mddev));
*private_conf = conf;
@@ -475,6 +487,20 @@ static inline int is_io_in_chunk_boundary(struct mddev *mddev,
}
}
+/*
+ * Convert disk_index to the disk order in which it is read/written.
+ * For example, if we have 4 disks, they are numbered 0,1,2,3. If we
+ * write the disks starting at disk 3, then the read/write order would
+ * be disk 3, then 0, then 1, and then disk 2, and we want map_disk_shift()
+ * to map the disks as follows: 0,1,2,3 => 1,2,3,0. So disk 0 would map
+ * to 1, 1 to 2, 2 to 3, and 3 to 0. That way we can compare disks in
+ * that 'output' space to understand the read/write disk ordering.
+ */
+static int map_disk_shift(int disk_index, int num_disks, int disk_shift)
+{
+ return ((disk_index + num_disks - disk_shift) % num_disks);
+}
+
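
A standalone sketch of the mapping described above, using the same helper: with 4 disks and disk_shift = 3, physical disks 0,1,2,3 land at positions 1,2,3,0 in read/write order.

#include <stdio.h>

/* Same helper as in the patch above. */
static int map_disk_shift(int disk_index, int num_disks, int disk_shift)
{
	return ((disk_index + num_disks - disk_shift) % num_disks);
}

int main(void)
{
	int disk;

	/* 4 disks, zone starts at disk 3: 0,1,2,3 map to 1,2,3,0 */
	for (disk = 0; disk < 4; disk++)
		printf("disk %d -> order %d\n", disk,
		       map_disk_shift(disk, 4, 3));
	return 0;
}
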
static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
{
struct r0conf *conf = mddev->private;
@@ -488,7 +514,9 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
sector_t end_disk_offset;
unsigned int end_disk_index;
unsigned int disk;
+ sector_t orig_start, orig_end;
+ orig_start = start;
zone = find_zone(conf, &start);
if (bio_end_sector(bio) > zone->zone_end) {
@@ -502,6 +530,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
} else
end = bio_end_sector(bio);
+ orig_end = end;
if (zone != conf->strip_zone)
end = end - zone[-1].zone_end;
@@ -513,13 +542,26 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
last_stripe_index = end;
sector_div(last_stripe_index, stripe_size);
- start_disk_index = (int)(start - first_stripe_index * stripe_size) /
- mddev->chunk_sectors;
+ /* In the first zone the original and alternate layouts are the same */
+ if ((conf->layout == RAID0_ORIG_LAYOUT) && (zone != conf->strip_zone)) {
+ sector_div(orig_start, mddev->chunk_sectors);
+ start_disk_index = sector_div(orig_start, zone->nb_dev);
+ start_disk_index = map_disk_shift(start_disk_index,
+ zone->nb_dev,
+ zone->disk_shift);
+ sector_div(orig_end, mddev->chunk_sectors);
+ end_disk_index = sector_div(orig_end, zone->nb_dev);
+ end_disk_index = map_disk_shift(end_disk_index,
+ zone->nb_dev, zone->disk_shift);
+ } else {
+ start_disk_index = (int)(start - first_stripe_index * stripe_size) /
+ mddev->chunk_sectors;
+ end_disk_index = (int)(end - last_stripe_index * stripe_size) /
+ mddev->chunk_sectors;
+ }
start_disk_offset = ((int)(start - first_stripe_index * stripe_size) %
mddev->chunk_sectors) +
first_stripe_index * mddev->chunk_sectors;
- end_disk_index = (int)(end - last_stripe_index * stripe_size) /
- mddev->chunk_sectors;
end_disk_offset = ((int)(end - last_stripe_index * stripe_size) %
mddev->chunk_sectors) +
last_stripe_index * mddev->chunk_sectors;
@@ -528,18 +570,22 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
sector_t dev_start, dev_end;
struct bio *discard_bio = NULL;
struct md_rdev *rdev;
+ int compare_disk;
+
+ compare_disk = map_disk_shift(disk, zone->nb_dev,
+ zone->disk_shift);
- if (disk < start_disk_index)
+ if (compare_disk < start_disk_index)
dev_start = (first_stripe_index + 1) *
mddev->chunk_sectors;
- else if (disk > start_disk_index)
+ else if (compare_disk > start_disk_index)
dev_start = first_stripe_index * mddev->chunk_sectors;
else
dev_start = start_disk_offset;
- if (disk < end_disk_index)
+ if (compare_disk < end_disk_index)
dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
- else if (disk > end_disk_index)
+ else if (compare_disk > end_disk_index)
dev_end = last_stripe_index * mddev->chunk_sectors;
else
dev_end = end_disk_offset;
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
index 3816e5477db1..8cc761ca7423 100644
--- a/drivers/md/raid0.h
+++ b/drivers/md/raid0.h
@@ -6,6 +6,7 @@ struct strip_zone {
sector_t zone_end; /* Start of the next zone (in sectors) */
sector_t dev_start; /* Zone offset in real dev (in sectors) */
int nb_dev; /* # of devices attached to the zone */
+ int disk_shift; /* start disk for the original layout */
};
/* Linux 3.14 (20d0189b101) made an unintended change to
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index e87184645c54..c40237cfdcb0 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1810,6 +1810,9 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
int number = rdev->raid_disk;
struct raid1_info *p = conf->mirrors + number;
+ if (unlikely(number >= conf->raid_disks))
+ goto abort;
+
if (rdev != p->rdev)
p = conf->mirrors + conf->raid_disks + number;
@@ -3132,6 +3135,7 @@ static int raid1_run(struct mddev *mddev)
* RAID1 needs at least one disk in active
*/
if (conf->raid_disks - mddev->degraded < 1) {
+ md_unregister_thread(&conf->thread);
ret = -EINVAL;
goto abort;
}
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index deddabfb07d7..3983d5c8b5cd 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -751,8 +751,16 @@ static struct md_rdev *read_balance(struct r10conf *conf,
disk = r10_bio->devs[slot].devnum;
rdev = rcu_dereference(conf->mirrors[disk].replacement);
if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
- r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
+ r10_bio->devs[slot].addr + sectors >
+ rdev->recovery_offset) {
+ /*
+			 * Read the replacement first to prevent reading both
+			 * rdev and replacement as NULL while the replacement
+			 * replaces rdev.
+ */
+ smp_mb();
rdev = rcu_dereference(conf->mirrors[disk].rdev);
+ }
if (rdev == NULL ||
test_bit(Faulty, &rdev->flags))
continue;
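
The barrier pairs with the writer side of the rdev/replacement handover, which publishes the replacement into rdev before clearing replacement. A user-space analogue of the pairing, with C11 fences standing in for smp_mb() (toy types, not the md structures):

#include <stdatomic.h>
#include <stddef.h>

/* Writer publishes the replacement into 'rdev' before clearing
 * 'replacement'; the reader loads 'replacement' first. With both
 * fences, a reader may see the two pointers equal, but never both
 * NULL. */
struct mirror { _Atomic(void *) rdev, replacement; };

static void writer_promote(struct mirror *m)
{
	void *repl = atomic_load(&m->replacement);

	atomic_store(&m->rdev, repl);
	atomic_thread_fence(memory_order_seq_cst);	/* like smp_mb() */
	atomic_store(&m->replacement, NULL);
}

static void *reader_pick(struct mirror *m)
{
	void *r = atomic_load(&m->replacement);

	atomic_thread_fence(memory_order_seq_cst);	/* pairs with writer */
	if (!r)
		r = atomic_load(&m->rdev);
	return r;
}

int main(void)
{
	int dev, repl;
	struct mirror m;

	atomic_init(&m.rdev, &dev);
	atomic_init(&m.replacement, &repl);
	writer_promote(&m);
	return reader_pick(&m) == (void *)&repl ? 0 : 1;
}
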
@@ -919,6 +927,7 @@ static void flush_pending_writes(struct r10conf *conf)
else
generic_make_request(bio);
bio = next;
+ cond_resched();
}
blk_finish_plug(&plug);
} else
@@ -1104,6 +1113,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
else
generic_make_request(bio);
bio = next;
+ cond_resched();
}
kfree(plug);
}
@@ -1363,9 +1373,15 @@ retry_write:
for (i = 0; i < conf->copies; i++) {
int d = r10_bio->devs[i].devnum;
- struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
- struct md_rdev *rrdev = rcu_dereference(
- conf->mirrors[d].replacement);
+ struct md_rdev *rdev, *rrdev;
+
+ rrdev = rcu_dereference(conf->mirrors[d].replacement);
+ /*
+		 * Read the replacement first to prevent reading both rdev and
+		 * replacement as NULL while the replacement replaces rdev.
+ */
+ smp_mb();
+ rdev = rcu_dereference(conf->mirrors[d].rdev);
if (rdev == rrdev)
rrdev = NULL;
if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
@@ -1826,9 +1842,12 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
int err = 0;
int number = rdev->raid_disk;
struct md_rdev **rdevp;
- struct raid10_info *p = conf->mirrors + number;
+ struct raid10_info *p;
print_conf(conf);
+ if (unlikely(number >= mddev->raid_disks))
+ return 0;
+ p = conf->mirrors + number;
if (rdev == p->rdev)
rdevp = &p->rdev;
else if (rdev == p->replacement)
@@ -2226,11 +2245,22 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
{
struct r10conf *conf = mddev->private;
int d;
- struct bio *wbio, *wbio2;
+ struct bio *wbio = r10_bio->devs[1].bio;
+ struct bio *wbio2 = r10_bio->devs[1].repl_bio;
+
+ /* Need to test wbio2->bi_end_io before we call
+	 * generic_make_request: if the former is NULL,
+ * the latter is free to free wbio2.
+ */
+ if (wbio2 && !wbio2->bi_end_io)
+ wbio2 = NULL;
if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
fix_recovery_read_error(r10_bio);
- end_sync_request(r10_bio);
+ if (wbio->bi_end_io)
+ end_sync_request(r10_bio);
+ if (wbio2)
+ end_sync_request(r10_bio);
return;
}
@@ -2239,14 +2269,6 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
* and submit the write request
*/
d = r10_bio->devs[1].devnum;
- wbio = r10_bio->devs[1].bio;
- wbio2 = r10_bio->devs[1].repl_bio;
- /* Need to test wbio2->bi_end_io before we call
- * generic_make_request as if the former is NULL,
- * the latter is free to free wbio2.
- */
- if (wbio2 && !wbio2->bi_end_io)
- wbio2 = NULL;
if (wbio->bi_end_io) {
atomic_inc(&conf->mirrors[d].rdev->nr_pending);
md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
@@ -2914,10 +2936,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
sector_t chunk_mask = conf->geo.chunk_mask;
int page_idx = 0;
- if (!mempool_initialized(&conf->r10buf_pool))
- if (init_resync(conf))
- return 0;
-
/*
* Allow skipping a full rebuild for incremental assembly
* of a clean array, like RAID1 does.
@@ -2933,6 +2951,10 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
return mddev->dev_sectors - sector_nr;
}
+ if (!mempool_initialized(&conf->r10buf_pool))
+ if (init_resync(conf))
+ return 0;
+
skipped:
max_sector = mddev->dev_sectors;
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
@@ -3048,7 +3070,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
int must_sync;
int any_working;
int need_recover = 0;
- int need_replace = 0;
struct raid10_info *mirror = &conf->mirrors[i];
struct md_rdev *mrdev, *mreplace;
@@ -3060,11 +3081,10 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
!test_bit(Faulty, &mrdev->flags) &&
!test_bit(In_sync, &mrdev->flags))
need_recover = 1;
- if (mreplace != NULL &&
- !test_bit(Faulty, &mreplace->flags))
- need_replace = 1;
+ if (mreplace && test_bit(Faulty, &mreplace->flags))
+ mreplace = NULL;
- if (!need_recover && !need_replace) {
+ if (!need_recover && !mreplace) {
rcu_read_unlock();
continue;
}
@@ -3080,8 +3100,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
rcu_read_unlock();
continue;
}
- if (mreplace && test_bit(Faulty, &mreplace->flags))
- mreplace = NULL;
/* Unless we are doing a full sync, or a replacement
* we only need to recover the block if it is set in
* the bitmap
@@ -3204,11 +3222,11 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
bio = r10_bio->devs[1].repl_bio;
if (bio)
bio->bi_end_io = NULL;
- /* Note: if need_replace, then bio
+				/* Note: if mreplace is not NULL, then bio
* cannot be NULL as r10buf_pool_alloc will
* have allocated it.
*/
- if (!need_replace)
+ if (!mreplace)
break;
bio->bi_next = biolist;
biolist = bio;
@@ -3629,6 +3647,20 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
return nc*fc;
}
+static void raid10_free_conf(struct r10conf *conf)
+{
+ if (!conf)
+ return;
+
+ mempool_exit(&conf->r10bio_pool);
+ kfree(conf->mirrors);
+ kfree(conf->mirrors_old);
+ kfree(conf->mirrors_new);
+ safe_put_page(conf->tmppage);
+ bioset_exit(&conf->bio_split);
+ kfree(conf);
+}
+
static struct r10conf *setup_conf(struct mddev *mddev)
{
struct r10conf *conf = NULL;
@@ -3711,20 +3743,24 @@ static struct r10conf *setup_conf(struct mddev *mddev)
return conf;
out:
- if (conf) {
- mempool_exit(&conf->r10bio_pool);
- kfree(conf->mirrors);
- safe_put_page(conf->tmppage);
- bioset_exit(&conf->bio_split);
- kfree(conf);
- }
+ raid10_free_conf(conf);
return ERR_PTR(err);
}
+static void raid10_set_io_opt(struct r10conf *conf)
+{
+ int raid_disks = conf->geo.raid_disks;
+
+ if (!(conf->geo.raid_disks % conf->geo.near_copies))
+ raid_disks /= conf->geo.near_copies;
+ blk_queue_io_opt(conf->mddev->queue, (conf->mddev->chunk_sectors << 9) *
+ raid_disks);
+}
+
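
The helper advertises one full stripe of data as the optimal I/O size. As a worked example (assumed geometry, not from the patch): a 4-disk near-2 layout with 512 KiB chunks holds two distinct data chunks per stripe, so io_opt comes out to 1 MiB:

#include <stdio.h>

int main(void)
{
	/* Assumed geometry: 4-disk RAID10, near_copies = 2, 512 KiB
	 * chunks (1024 sectors of 512 bytes each). */
	int raid_disks = 4, near_copies = 2;
	unsigned int chunk_sectors = 1024;

	if (!(raid_disks % near_copies))
		raid_disks /= near_copies;
	/* two distinct data chunks per stripe -> 1 MiB */
	printf("io_opt = %u bytes\n", (chunk_sectors << 9) * raid_disks);
	return 0;
}
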
static int raid10_run(struct mddev *mddev)
{
struct r10conf *conf;
- int i, disk_idx, chunk_size;
+ int i, disk_idx;
struct raid10_info *disk;
struct md_rdev *rdev;
sector_t size;
@@ -3745,6 +3781,9 @@ static int raid10_run(struct mddev *mddev)
if (!conf)
goto out;
+ mddev->thread = conf->thread;
+ conf->thread = NULL;
+
if (mddev_is_clustered(conf->mddev)) {
int fc, fo;
@@ -3757,21 +3796,13 @@ static int raid10_run(struct mddev *mddev)
}
}
- mddev->thread = conf->thread;
- conf->thread = NULL;
-
- chunk_size = mddev->chunk_sectors << 9;
if (mddev->queue) {
blk_queue_max_discard_sectors(mddev->queue,
mddev->chunk_sectors);
blk_queue_max_write_same_sectors(mddev->queue, 0);
blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
- blk_queue_io_min(mddev->queue, chunk_size);
- if (conf->geo.raid_disks % conf->geo.near_copies)
- blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
- else
- blk_queue_io_opt(mddev->queue, chunk_size *
- (conf->geo.raid_disks / conf->geo.near_copies));
+ blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
+ raid10_set_io_opt(conf);
}
rdev_for_each(rdev, mddev) {
@@ -3931,10 +3962,7 @@ static int raid10_run(struct mddev *mddev)
out_free_conf:
md_unregister_thread(&mddev->thread);
- mempool_exit(&conf->r10bio_pool);
- safe_put_page(conf->tmppage);
- kfree(conf->mirrors);
- kfree(conf);
+ raid10_free_conf(conf);
mddev->private = NULL;
out:
return -EIO;
@@ -3942,15 +3970,7 @@ out:
static void raid10_free(struct mddev *mddev, void *priv)
{
- struct r10conf *conf = priv;
-
- mempool_exit(&conf->r10bio_pool);
- safe_put_page(conf->tmppage);
- kfree(conf->mirrors);
- kfree(conf->mirrors_old);
- kfree(conf->mirrors_new);
- bioset_exit(&conf->bio_split);
- kfree(conf);
+ raid10_free_conf(priv);
}
static void raid10_quiesce(struct mddev *mddev, int quiesce)
@@ -4745,6 +4765,7 @@ static void end_reshape(struct r10conf *conf)
stripe /= conf->geo.near_copies;
if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
+ raid10_set_io_opt(conf);
}
conf->fullsync = 0;
}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 474cf6abefea..f3d60c4b34b8 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -36,6 +36,7 @@
*/
#include <linux/blkdev.h>
+#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/async_tx.h>
@@ -2598,7 +2599,7 @@ static void raid5_end_write_request(struct bio *bi)
struct stripe_head *sh = bi->bi_private;
struct r5conf *conf = sh->raid_conf;
int disks = sh->disks, i;
- struct md_rdev *uninitialized_var(rdev);
+ struct md_rdev *rdev;
sector_t first_bad;
int bad_sectors;
int replacement = 0;
@@ -2666,10 +2667,10 @@ static void raid5_end_write_request(struct bio *bi)
if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
- raid5_release_stripe(sh);
if (sh->batch_head && sh != sh->batch_head)
raid5_release_stripe(sh->batch_head);
+ raid5_release_stripe(sh);
}
static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
@@ -3728,7 +3729,7 @@ static void handle_stripe_fill(struct stripe_head *sh,
* back cache (prexor with orig_page, and then xor with
* page) in the read path
*/
- if (s->injournal && s->failed) {
+ if (s->to_read && s->injournal && s->failed) {
if (test_bit(STRIPE_R5C_CACHING, &sh->state))
r5c_make_stripe_write_out(sh);
goto out;
@@ -6334,7 +6335,18 @@ static void raid5d(struct md_thread *thread)
spin_unlock_irq(&conf->device_lock);
md_check_recovery(mddev);
spin_lock_irq(&conf->device_lock);
+
+ /*
+			 * Waiting on MD_SB_CHANGE_PENDING below may deadlock,
+			 * because md_check_recovery() is needed to clear
+			 * the flag when using mdmon.
+ */
+ continue;
}
+
+ wait_event_lock_irq(mddev->sb_wait,
+ !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
+ conf->device_lock);
}
pr_debug("%d stripes handled\n", handled);
@@ -7147,6 +7159,12 @@ static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded
return 0;
}
+static void raid5_set_io_opt(struct r5conf *conf)
+{
+ blk_queue_io_opt(conf->mddev->queue, (conf->chunk_sectors << 9) *
+ (conf->raid_disks - conf->max_degraded));
+}
+
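
Same idea for raid5: the optimal I/O size is the chunk size times the number of data disks. With an assumed 4-disk RAID5 and 512 KiB chunks, that is three data chunks, 1.5 MiB:

#include <stdio.h>

int main(void)
{
	/* Assumed geometry: 4-disk RAID5 (max_degraded = 1), 512 KiB chunks. */
	int raid_disks = 4, max_degraded = 1;
	unsigned int chunk_sectors = 1024;

	/* three data chunks per stripe -> 1.5 MiB */
	printf("io_opt = %u bytes\n",
	       (chunk_sectors << 9) * (raid_disks - max_degraded));
	return 0;
}
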
static int raid5_run(struct mddev *mddev)
{
struct r5conf *conf;
@@ -7436,8 +7454,7 @@ static int raid5_run(struct mddev *mddev)
chunk_size = mddev->chunk_sectors << 9;
blk_queue_io_min(mddev->queue, chunk_size);
- blk_queue_io_opt(mddev->queue, chunk_size *
- (conf->raid_disks - conf->max_degraded));
+ raid5_set_io_opt(conf);
mddev->queue->limits.raid_partial_stripes_expensive = 1;
/*
* We can only discard a whole stripe. It doesn't make sense to
@@ -8031,6 +8048,7 @@ static void end_reshape(struct r5conf *conf)
/ PAGE_SIZE);
if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe)
conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe;
+ raid5_set_io_opt(conf);
}
}
}