aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/dm-bufio.c7
-rw-r--r--drivers/md/dm-cache-metadata.c107
-rw-r--r--drivers/md/dm-cache-metadata.h9
-rw-r--r--drivers/md/dm-cache-target.c85
-rw-r--r--drivers/md/dm-crypt.c81
-rw-r--r--drivers/md/dm-io.c22
-rw-r--r--drivers/md/dm-log-userspace-transfer.c2
-rw-r--r--drivers/md/dm-thin-metadata.c89
-rw-r--r--drivers/md/dm-thin.c34
-rw-r--r--drivers/md/dm-verity.c15
-rw-r--r--drivers/md/dm.c15
-rw-r--r--drivers/md/md.c21
-rw-r--r--drivers/md/persistent-data/dm-block-manager.c15
-rw-r--r--drivers/md/persistent-data/dm-block-manager.h3
-rw-r--r--drivers/md/persistent-data/dm-transaction-manager.c5
-rw-r--r--drivers/md/persistent-data/dm-transaction-manager.h17
-rw-r--r--drivers/md/raid1.c65
-rw-r--r--drivers/md/raid10.c27
-rw-r--r--drivers/md/raid5.c20
19 files changed, 338 insertions, 301 deletions
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 66c5d130c8c2..ca1621b49453 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -465,6 +465,7 @@ static void __relink_lru(struct dm_buffer *b, int dirty)
c->n_buffers[dirty]++;
b->list_mode = dirty;
list_move(&b->lru_list, &c->lru[dirty]);
+ b->last_accessed = jiffies;
}
/*----------------------------------------------------------------
@@ -1485,9 +1486,9 @@ static long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
freed += __cleanup_old_buffer(b, gfp_mask, 0);
if (!--nr_to_scan)
- break;
+ return freed;
+ dm_bufio_cond_resched();
}
- dm_bufio_cond_resched();
}
return freed;
}
@@ -1541,7 +1542,7 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
BUG_ON(block_size < 1 << SECTOR_SHIFT ||
(block_size & (block_size - 1)));
- c = kmalloc(sizeof(*c), GFP_KERNEL);
+ c = kzalloc(sizeof(*c), GFP_KERNEL);
if (!c) {
r = -ENOMEM;
goto bad_client;
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index 9ef0752e8a08..a87d3fab0271 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -120,6 +120,12 @@ struct dm_cache_metadata {
unsigned policy_version[CACHE_POLICY_VERSION_SIZE];
size_t policy_hint_size;
struct dm_cache_statistics stats;
+
+ /*
+ * Reading the space map root can fail, so we read it into this
+ * buffer before the superblock is locked and updated.
+ */
+ __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
};
/*-------------------------------------------------------------------
@@ -260,11 +266,31 @@ static void __setup_mapping_info(struct dm_cache_metadata *cmd)
}
}
+static int __save_sm_root(struct dm_cache_metadata *cmd)
+{
+ int r;
+ size_t metadata_len;
+
+ r = dm_sm_root_size(cmd->metadata_sm, &metadata_len);
+ if (r < 0)
+ return r;
+
+ return dm_sm_copy_root(cmd->metadata_sm, &cmd->metadata_space_map_root,
+ metadata_len);
+}
+
+static void __copy_sm_root(struct dm_cache_metadata *cmd,
+ struct cache_disk_superblock *disk_super)
+{
+ memcpy(&disk_super->metadata_space_map_root,
+ &cmd->metadata_space_map_root,
+ sizeof(cmd->metadata_space_map_root));
+}
+
static int __write_initial_superblock(struct dm_cache_metadata *cmd)
{
int r;
struct dm_block *sblock;
- size_t metadata_len;
struct cache_disk_superblock *disk_super;
sector_t bdev_size = i_size_read(cmd->bdev->bd_inode) >> SECTOR_SHIFT;
@@ -272,12 +298,16 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
if (bdev_size > DM_CACHE_METADATA_MAX_SECTORS)
bdev_size = DM_CACHE_METADATA_MAX_SECTORS;
- r = dm_sm_root_size(cmd->metadata_sm, &metadata_len);
+ r = dm_tm_pre_commit(cmd->tm);
if (r < 0)
return r;
- r = dm_tm_pre_commit(cmd->tm);
- if (r < 0)
+ /*
+ * dm_sm_copy_root() can fail. So we need to do it before we start
+ * updating the superblock.
+ */
+ r = __save_sm_root(cmd);
+ if (r)
return r;
r = superblock_lock_zero(cmd, &sblock);
@@ -293,10 +323,7 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
memset(disk_super->policy_version, 0, sizeof(disk_super->policy_version));
disk_super->policy_hint_size = 0;
- r = dm_sm_copy_root(cmd->metadata_sm, &disk_super->metadata_space_map_root,
- metadata_len);
- if (r < 0)
- goto bad_locked;
+ __copy_sm_root(cmd, disk_super);
disk_super->mapping_root = cpu_to_le64(cmd->root);
disk_super->hint_root = cpu_to_le64(cmd->hint_root);
@@ -313,10 +340,6 @@ static int __write_initial_superblock(struct dm_cache_metadata *cmd)
disk_super->write_misses = cpu_to_le32(0);
return dm_tm_commit(cmd->tm, sblock);
-
-bad_locked:
- dm_bm_unlock(sblock);
- return r;
}
static int __format_metadata(struct dm_cache_metadata *cmd)
@@ -402,6 +425,15 @@ static int __open_metadata(struct dm_cache_metadata *cmd)
disk_super = dm_block_data(sblock);
+ /* Verify the data block size hasn't changed */
+ if (le32_to_cpu(disk_super->data_block_size) != cmd->data_block_size) {
+ DMERR("changing the data block size (from %u to %llu) is not supported",
+ le32_to_cpu(disk_super->data_block_size),
+ (unsigned long long)cmd->data_block_size);
+ r = -EINVAL;
+ goto bad;
+ }
+
r = __check_incompat_features(disk_super, cmd);
if (r < 0)
goto bad;
@@ -530,8 +562,9 @@ static int __begin_transaction_flags(struct dm_cache_metadata *cmd,
disk_super = dm_block_data(sblock);
update_flags(disk_super, mutator);
read_superblock_fields(cmd, disk_super);
+ dm_bm_unlock(sblock);
- return dm_bm_flush_and_unlock(cmd->bm, sblock);
+ return dm_bm_flush(cmd->bm);
}
static int __begin_transaction(struct dm_cache_metadata *cmd)
@@ -559,7 +592,6 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
flags_mutator mutator)
{
int r;
- size_t metadata_len;
struct cache_disk_superblock *disk_super;
struct dm_block *sblock;
@@ -577,8 +609,8 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
if (r < 0)
return r;
- r = dm_sm_root_size(cmd->metadata_sm, &metadata_len);
- if (r < 0)
+ r = __save_sm_root(cmd);
+ if (r)
return r;
r = superblock_lock(cmd, &sblock);
@@ -605,13 +637,7 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
disk_super->read_misses = cpu_to_le32(cmd->stats.read_misses);
disk_super->write_hits = cpu_to_le32(cmd->stats.write_hits);
disk_super->write_misses = cpu_to_le32(cmd->stats.write_misses);
-
- r = dm_sm_copy_root(cmd->metadata_sm, &disk_super->metadata_space_map_root,
- metadata_len);
- if (r < 0) {
- dm_bm_unlock(sblock);
- return r;
- }
+ __copy_sm_root(cmd, disk_super);
return dm_tm_commit(cmd->tm, sblock);
}
@@ -1228,22 +1254,12 @@ static int begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *po
return 0;
}
-int dm_cache_begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy)
+static int save_hint(void *context, dm_cblock_t cblock, dm_oblock_t oblock, uint32_t hint)
{
+ struct dm_cache_metadata *cmd = context;
+ __le32 value = cpu_to_le32(hint);
int r;
- down_write(&cmd->root_lock);
- r = begin_hints(cmd, policy);
- up_write(&cmd->root_lock);
-
- return r;
-}
-
-static int save_hint(struct dm_cache_metadata *cmd, dm_cblock_t cblock,
- uint32_t hint)
-{
- int r;
- __le32 value = cpu_to_le32(hint);
__dm_bless_for_disk(&value);
r = dm_array_set_value(&cmd->hint_info, cmd->hint_root,
@@ -1253,16 +1269,25 @@ static int save_hint(struct dm_cache_metadata *cmd, dm_cblock_t cblock,
return r;
}
-int dm_cache_save_hint(struct dm_cache_metadata *cmd, dm_cblock_t cblock,
- uint32_t hint)
+static int write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy)
{
int r;
- if (!hints_array_initialized(cmd))
- return 0;
+ r = begin_hints(cmd, policy);
+ if (r) {
+ DMERR("begin_hints failed");
+ return r;
+ }
+
+ return policy_walk_mappings(policy, save_hint, cmd);
+}
+
+int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy)
+{
+ int r;
down_write(&cmd->root_lock);
- r = save_hint(cmd, cblock, hint);
+ r = write_hints(cmd, policy);
up_write(&cmd->root_lock);
return r;
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
index cd906f14f98d..f0fb1dd26524 100644
--- a/drivers/md/dm-cache-metadata.h
+++ b/drivers/md/dm-cache-metadata.h
@@ -128,14 +128,7 @@ void dm_cache_dump(struct dm_cache_metadata *cmd);
* rather than querying the policy for each cblock, we let it walk its data
* structures and fill in the hints in whatever order it wishes.
*/
-
-int dm_cache_begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *p);
-
-/*
- * requests hints for every cblock and stores in the metadata device.
- */
-int dm_cache_save_hint(struct dm_cache_metadata *cmd,
- dm_cblock_t cblock, uint32_t hint);
+int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *p);
/*
* Query method. Are all the blocks in the cache clean?
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 074b9c8e4cf0..2331543005b2 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -231,7 +231,7 @@ struct cache {
/*
* cache_size entries, dirty if set
*/
- dm_cblock_t nr_dirty;
+ atomic_t nr_dirty;
unsigned long *dirty_bitset;
/*
@@ -239,7 +239,7 @@ struct cache {
*/
dm_dblock_t discard_nr_blocks;
unsigned long *discard_bitset;
- uint32_t discard_block_size; /* a power of 2 times sectors per block */
+ uint32_t discard_block_size;
/*
* Rather than reconstructing the table line for the status we just
@@ -493,7 +493,7 @@ static bool is_dirty(struct cache *cache, dm_cblock_t b)
static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
{
if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
- cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) + 1);
+ atomic_inc(&cache->nr_dirty);
policy_set_dirty(cache->policy, oblock);
}
}
@@ -502,8 +502,7 @@ static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cbl
{
if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
policy_clear_dirty(cache->policy, oblock);
- cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) - 1);
- if (!from_cblock(cache->nr_dirty))
+ if (atomic_dec_return(&cache->nr_dirty) == 0)
dm_table_event(cache->ti->table);
}
}
@@ -891,8 +890,8 @@ static void migration_success_pre_commit(struct dm_cache_migration *mg)
struct cache *cache = mg->cache;
if (mg->writeback) {
- cell_defer(cache, mg->old_ocell, false);
clear_dirty(cache, mg->old_oblock, mg->cblock);
+ cell_defer(cache, mg->old_ocell, false);
cleanup_migration(mg);
return;
@@ -947,13 +946,13 @@ static void migration_success_post_commit(struct dm_cache_migration *mg)
}
} else {
+ clear_dirty(cache, mg->new_oblock, mg->cblock);
if (mg->requeue_holder)
cell_defer(cache, mg->new_ocell, true);
else {
bio_endio(mg->new_ocell->holder, 0);
cell_defer(cache, mg->new_ocell, false);
}
- clear_dirty(cache, mg->new_oblock, mg->cblock);
cleanup_migration(mg);
}
}
@@ -2171,35 +2170,6 @@ static int create_cache_policy(struct cache *cache, struct cache_args *ca,
return 0;
}
-/*
- * We want the discard block size to be a power of two, at least the size
- * of the cache block size, and have no more than 2^14 discard blocks
- * across the origin.
- */
-#define MAX_DISCARD_BLOCKS (1 << 14)
-
-static bool too_many_discard_blocks(sector_t discard_block_size,
- sector_t origin_size)
-{
- (void) sector_div(origin_size, discard_block_size);
-
- return origin_size > MAX_DISCARD_BLOCKS;
-}
-
-static sector_t calculate_discard_block_size(sector_t cache_block_size,
- sector_t origin_size)
-{
- sector_t discard_block_size;
-
- discard_block_size = roundup_pow_of_two(cache_block_size);
-
- if (origin_size)
- while (too_many_discard_blocks(discard_block_size, origin_size))
- discard_block_size *= 2;
-
- return discard_block_size;
-}
-
#define DEFAULT_MIGRATION_THRESHOLD 2048
static int cache_create(struct cache_args *ca, struct cache **result)
@@ -2224,6 +2194,8 @@ static int cache_create(struct cache_args *ca, struct cache **result)
ti->num_discard_bios = 1;
ti->discards_supported = true;
ti->discard_zeroes_data_unsupported = true;
+ /* Discard bios must be split on a block boundary */
+ ti->split_discard_bios = true;
cache->features = ca->features;
ti->per_bio_data_size = get_per_bio_data_size(cache);
@@ -2313,7 +2285,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
atomic_set(&cache->quiescing_ack, 0);
r = -ENOMEM;
- cache->nr_dirty = 0;
+ atomic_set(&cache->nr_dirty, 0);
cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
if (!cache->dirty_bitset) {
*error = "could not allocate dirty bitset";
@@ -2321,9 +2293,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
}
clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
- cache->discard_block_size =
- calculate_discard_block_size(cache->sectors_per_block,
- cache->origin_sectors);
+ cache->discard_block_size = cache->sectors_per_block;
cache->discard_nr_blocks = oblock_to_dblock(cache, cache->origin_blocks);
cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
if (!cache->discard_bitset) {
@@ -2537,6 +2507,7 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
} else {
inc_hit_counter(cache, bio);
+ pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
!is_dirty(cache, lookup_result.cblock))
@@ -2631,30 +2602,6 @@ static int write_discard_bitset(struct cache *cache)
return 0;
}
-static int save_hint(void *context, dm_cblock_t cblock, dm_oblock_t oblock,
- uint32_t hint)
-{
- struct cache *cache = context;
- return dm_cache_save_hint(cache->cmd, cblock, hint);
-}
-
-static int write_hints(struct cache *cache)
-{
- int r;
-
- r = dm_cache_begin_hints(cache->cmd, cache->policy);
- if (r) {
- DMERR("dm_cache_begin_hints failed");
- return r;
- }
-
- r = policy_walk_mappings(cache->policy, save_hint, cache);
- if (r)
- DMERR("policy_walk_mappings failed");
-
- return r;
-}
-
/*
* returns true on success
*/
@@ -2672,7 +2619,7 @@ static bool sync_metadata(struct cache *cache)
save_stats(cache);
- r3 = write_hints(cache);
+ r3 = dm_cache_write_hints(cache->cmd, cache->policy);
if (r3)
DMERR("could not write hints");
@@ -2880,7 +2827,7 @@ static void cache_status(struct dm_target *ti, status_type_t type,
residency = policy_residency(cache->policy);
- DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %llu ",
+ DMEMIT("%u %llu/%llu %u %llu/%llu %u %u %u %u %u %u %lu ",
(unsigned)(DM_CACHE_METADATA_BLOCK_SIZE >> SECTOR_SHIFT),
(unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
(unsigned long long)nr_blocks_metadata,
@@ -2893,7 +2840,7 @@ static void cache_status(struct dm_target *ti, status_type_t type,
(unsigned) atomic_read(&cache->stats.write_miss),
(unsigned) atomic_read(&cache->stats.demotion),
(unsigned) atomic_read(&cache->stats.promotion),
- (unsigned long long) from_cblock(cache->nr_dirty));
+ (unsigned long) atomic_read(&cache->nr_dirty));
if (writethrough_mode(&cache->features))
DMEMIT("1 writethrough ");
@@ -3120,7 +3067,7 @@ static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
/*
* FIXME: these limits may be incompatible with the cache device
*/
- limits->max_discard_sectors = cache->discard_block_size * 1024;
+ limits->max_discard_sectors = cache->discard_block_size;
limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
}
@@ -3145,7 +3092,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type cache_target = {
.name = "cache",
- .version = {1, 3, 0},
+ .version = {1, 4, 0},
.module = THIS_MODULE,
.ctr = cache_ctr,
.dtr = cache_dtr,
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 784695d22fde..9533f835ce07 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -19,7 +19,6 @@
#include <linux/crypto.h>
#include <linux/workqueue.h>
#include <linux/backing-dev.h>
-#include <linux/percpu.h>
#include <linux/atomic.h>
#include <linux/scatterlist.h>
#include <asm/page.h>
@@ -43,6 +42,7 @@ struct convert_context {
struct bvec_iter iter_out;
sector_t cc_sector;
atomic_t cc_pending;
+ struct ablkcipher_request *req;
};
/*
@@ -111,15 +111,7 @@ struct iv_tcw_private {
enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID };
/*
- * Duplicated per-CPU state for cipher.
- */
-struct crypt_cpu {
- struct ablkcipher_request *req;
-};
-
-/*
- * The fields in here must be read only after initialization,
- * changing state should be in crypt_cpu.
+ * The fields in here must be read only after initialization.
*/
struct crypt_config {
struct dm_dev *dev;
@@ -150,12 +142,6 @@ struct crypt_config {
sector_t iv_offset;
unsigned int iv_size;
- /*
- * Duplicated per cpu state. Access through
- * per_cpu_ptr() only.
- */
- struct crypt_cpu __percpu *cpu;
-
/* ESSIV: struct crypto_cipher *essiv_tfm */
void *iv_private;
struct crypto_ablkcipher **tfms;
@@ -192,11 +178,6 @@ static void clone_init(struct dm_crypt_io *, struct bio *);
static void kcryptd_queue_crypt(struct dm_crypt_io *io);
static u8 *iv_of_dmreq(struct crypt_config *cc, struct dm_crypt_request *dmreq);
-static struct crypt_cpu *this_crypt_config(struct crypt_config *cc)
-{
- return this_cpu_ptr(cc->cpu);
-}
-
/*
* Use this to access cipher attributes that are the same for each CPU.
*/
@@ -903,16 +884,15 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
static void crypt_alloc_req(struct crypt_config *cc,
struct convert_context *ctx)
{
- struct crypt_cpu *this_cc = this_crypt_config(cc);
unsigned key_index = ctx->cc_sector & (cc->tfms_count - 1);
- if (!this_cc->req)
- this_cc->req = mempool_alloc(cc->req_pool, GFP_NOIO);
+ if (!ctx->req)
+ ctx->req = mempool_alloc(cc->req_pool, GFP_NOIO);
- ablkcipher_request_set_tfm(this_cc->req, cc->tfms[key_index]);
- ablkcipher_request_set_callback(this_cc->req,
+ ablkcipher_request_set_tfm(ctx->req, cc->tfms[key_index]);
+ ablkcipher_request_set_callback(ctx->req,
CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
- kcryptd_async_done, dmreq_of_req(cc, this_cc->req));
+ kcryptd_async_done, dmreq_of_req(cc, ctx->req));
}
/*
@@ -921,7 +901,6 @@ static void crypt_alloc_req(struct crypt_config *cc,
static int crypt_convert(struct crypt_config *cc,
struct convert_context *ctx)
{
- struct crypt_cpu *this_cc = this_crypt_config(cc);
int r;
atomic_set(&ctx->cc_pending, 1);
@@ -932,7 +911,7 @@ static int crypt_convert(struct crypt_config *cc,
atomic_inc(&ctx->cc_pending);
- r = crypt_convert_block(cc, ctx, this_cc->req);
+ r = crypt_convert_block(cc, ctx, ctx->req);
switch (r) {
/* async */
@@ -941,7 +920,7 @@ static int crypt_convert(struct crypt_config *cc,
reinit_completion(&ctx->restart);
/* fall through*/
case -EINPROGRESS:
- this_cc->req = NULL;
+ ctx->req = NULL;
ctx->cc_sector++;
continue;
@@ -1040,6 +1019,7 @@ static struct dm_crypt_io *crypt_io_alloc(struct crypt_config *cc,
io->sector = sector;
io->error = 0;
io->base_io = NULL;
+ io->ctx.req = NULL;
atomic_set(&io->io_pending, 0);
return io;
@@ -1065,6 +1045,8 @@ static void crypt_dec_pending(struct dm_crypt_io *io)
if (!atomic_dec_and_test(&io->io_pending))
return;
+ if (io->ctx.req)
+ mempool_free(io->ctx.req, cc->req_pool);
mempool_free(io, cc->io_pool);
if (likely(!base_io))
@@ -1492,8 +1474,6 @@ static int crypt_wipe_key(struct crypt_config *cc)
static void crypt_dtr(struct dm_target *ti)
{
struct crypt_config *cc = ti->private;
- struct crypt_cpu *cpu_cc;
- int cpu;
ti->private = NULL;
@@ -1505,13 +1485,6 @@ static void crypt_dtr(struct dm_target *ti)
if (cc->crypt_queue)
destroy_workqueue(cc->crypt_queue);
- if (cc->cpu)
- for_each_possible_cpu(cpu) {
- cpu_cc = per_cpu_ptr(cc->cpu, cpu);
- if (cpu_cc->req)
- mempool_free(cpu_cc->req, cc->req_pool);
- }
-
crypt_free_tfms(cc);
if (cc->bs)
@@ -1530,9 +1503,6 @@ static void crypt_dtr(struct dm_target *ti)
if (cc->dev)
dm_put_device(ti, cc->dev);
- if (cc->cpu)
- free_percpu(cc->cpu);
-
kzfree(cc->cipher);
kzfree(cc->cipher_string);
@@ -1588,13 +1558,6 @@ static int crypt_ctr_cipher(struct dm_target *ti,
if (tmp)
DMWARN("Ignoring unexpected additional cipher options");
- cc->cpu = __alloc_percpu(sizeof(*(cc->cpu)),
- __alignof__(struct crypt_cpu));
- if (!cc->cpu) {
- ti->error = "Cannot allocate per cpu state";
- goto bad_mem;
- }
-
/*
* For compatibility with the original dm-crypt mapping format, if
* only the cipher name is supplied, use cbc-plain.
@@ -1718,6 +1681,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
unsigned int key_size, opt_params;
unsigned long long tmpll;
int ret;
+ size_t iv_size_padding;
struct dm_arg_set as;
const char *opt_string;
char dummy;
@@ -1754,12 +1718,23 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
cc->dmreq_start = sizeof(struct ablkcipher_request);
cc->dmreq_start += crypto_ablkcipher_reqsize(any_tfm(cc));
- cc->dmreq_start = ALIGN(cc->dmreq_start, crypto_tfm_ctx_alignment());
- cc->dmreq_start += crypto_ablkcipher_alignmask(any_tfm(cc)) &
- ~(crypto_tfm_ctx_alignment() - 1);
+ cc->dmreq_start = ALIGN(cc->dmreq_start, __alignof__(struct dm_crypt_request));
+
+ if (crypto_ablkcipher_alignmask(any_tfm(cc)) < CRYPTO_MINALIGN) {
+ /* Allocate the padding exactly */
+ iv_size_padding = -(cc->dmreq_start + sizeof(struct dm_crypt_request))
+ & crypto_ablkcipher_alignmask(any_tfm(cc));
+ } else {
+ /*
+ * If the cipher requires greater alignment than kmalloc
+ * alignment, we don't know the exact position of the
+ * initialization vector. We must assume worst case.
+ */
+ iv_size_padding = crypto_ablkcipher_alignmask(any_tfm(cc));
+ }
cc->req_pool = mempool_create_kmalloc_pool(MIN_IOS, cc->dmreq_start +
- sizeof(struct dm_crypt_request) + cc->iv_size);
+ sizeof(struct dm_crypt_request) + iv_size_padding + cc->iv_size);
if (!cc->req_pool) {
ti->error = "Cannot allocate crypt request mempool";
goto bad;
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 3842ac738f98..db404a0f7e2c 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -10,6 +10,7 @@
#include <linux/device-mapper.h>
#include <linux/bio.h>
+#include <linux/completion.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/sched.h>
@@ -32,7 +33,7 @@ struct dm_io_client {
struct io {
unsigned long error_bits;
atomic_t count;
- struct task_struct *sleeper;
+ struct completion *wait;
struct dm_io_client *client;
io_notify_fn callback;
void *context;
@@ -121,8 +122,8 @@ static void dec_count(struct io *io, unsigned int region, int error)
invalidate_kernel_vmap_range(io->vma_invalidate_address,
io->vma_invalidate_size);
- if (io->sleeper)
- wake_up_process(io->sleeper);
+ if (io->wait)
+ complete(io->wait);
else {
unsigned long r = io->error_bits;
@@ -387,6 +388,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
*/
volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1];
struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io));
+ DECLARE_COMPLETION_ONSTACK(wait);
if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
WARN_ON(1);
@@ -395,7 +397,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
io->error_bits = 0;
atomic_set(&io->count, 1); /* see dispatch_io() */
- io->sleeper = current;
+ io->wait = &wait;
io->client = client;
io->vma_invalidate_address = dp->vma_invalidate_address;
@@ -403,15 +405,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
dispatch_io(rw, num_regions, where, dp, io, 1);
- while (1) {
- set_current_state(TASK_UNINTERRUPTIBLE);
-
- if (!atomic_read(&io->count))
- break;
-
- io_schedule();
- }
- set_current_state(TASK_RUNNING);
+ wait_for_completion_io(&wait);
if (error_bits)
*error_bits = io->error_bits;
@@ -434,7 +428,7 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions,
io = mempool_alloc(client->pool, GFP_NOIO);
io->error_bits = 0;
atomic_set(&io->count, 1); /* see dispatch_io() */
- io->sleeper = NULL;
+ io->wait = NULL;
io->client = client;
io->callback = fn;
io->context = context;
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c
index 08d9a207259a..c69d0b787746 100644
--- a/drivers/md/dm-log-userspace-transfer.c
+++ b/drivers/md/dm-log-userspace-transfer.c
@@ -272,7 +272,7 @@ int dm_ulog_tfr_init(void)
r = cn_add_callback(&ulog_cn_id, "dmlogusr", cn_ulog_callback);
if (r) {
- cn_del_callback(&ulog_cn_id);
+ kfree(prealloced_cn_msg);
return r;
}
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index fb9efc829182..e9d33ad59df5 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -192,6 +192,13 @@ struct dm_pool_metadata {
* operation possible in this state is the closing of the device.
*/
bool fail_io:1;
+
+ /*
+ * Reading the space map roots can fail, so we read it into these
+ * buffers before the superblock is locked and updated.
+ */
+ __u8 data_space_map_root[SPACE_MAP_ROOT_SIZE];
+ __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
};
struct dm_thin_device {
@@ -431,26 +438,53 @@ static void __setup_btree_details(struct dm_pool_metadata *pmd)
pmd->details_info.value_type.equal = NULL;
}
+static int save_sm_roots(struct dm_pool_metadata *pmd)
+{
+ int r;
+ size_t len;
+
+ r = dm_sm_root_size(pmd->metadata_sm, &len);
+ if (r < 0)
+ return r;
+
+ r = dm_sm_copy_root(pmd->metadata_sm, &pmd->metadata_space_map_root, len);
+ if (r < 0)
+ return r;
+
+ r = dm_sm_root_size(pmd->data_sm, &len);
+ if (r < 0)
+ return r;
+
+ return dm_sm_copy_root(pmd->data_sm, &pmd->data_space_map_root, len);
+}
+
+static void copy_sm_roots(struct dm_pool_metadata *pmd,
+ struct thin_disk_superblock *disk)
+{
+ memcpy(&disk->metadata_space_map_root,
+ &pmd->metadata_space_map_root,
+ sizeof(pmd->metadata_space_map_root));
+
+ memcpy(&disk->data_space_map_root,
+ &pmd->data_space_map_root,
+ sizeof(pmd->data_space_map_root));
+}
+
static int __write_initial_superblock(struct dm_pool_metadata *pmd)
{
int r;
struct dm_block *sblock;
- size_t metadata_len, data_len;
struct thin_disk_superblock *disk_super;
sector_t bdev_size = i_size_read(pmd->bdev->bd_inode) >> SECTOR_SHIFT;
if (bdev_size > THIN_METADATA_MAX_SECTORS)
bdev_size = THIN_METADATA_MAX_SECTORS;
- r = dm_sm_root_size(pmd->metadata_sm, &metadata_len);
- if (r < 0)
- return r;
-
- r = dm_sm_root_size(pmd->data_sm, &data_len);
+ r = dm_sm_commit(pmd->data_sm);
if (r < 0)
return r;
- r = dm_sm_commit(pmd->data_sm);
+ r = save_sm_roots(pmd);
if (r < 0)
return r;
@@ -471,15 +505,7 @@ static int __write_initial_superblock(struct dm_pool_metadata *pmd)
disk_super->trans_id = 0;
disk_super->held_root = 0;
- r = dm_sm_copy_root(pmd->metadata_sm, &disk_super->metadata_space_map_root,
- metadata_len);
- if (r < 0)
- goto bad_locked;
-
- r = dm_sm_copy_root(pmd->data_sm, &disk_super->data_space_map_root,
- data_len);
- if (r < 0)
- goto bad_locked;
+ copy_sm_roots(pmd, disk_super);
disk_super->data_mapping_root = cpu_to_le64(pmd->root);
disk_super->device_details_root = cpu_to_le64(pmd->details_root);
@@ -488,10 +514,6 @@ static int __write_initial_superblock(struct dm_pool_metadata *pmd)
disk_super->data_block_size = cpu_to_le32(pmd->data_block_size);
return dm_tm_commit(pmd->tm, sblock);
-
-bad_locked:
- dm_bm_unlock(sblock);
- return r;
}
static int __format_metadata(struct dm_pool_metadata *pmd)
@@ -591,6 +613,15 @@ static int __open_metadata(struct dm_pool_metadata *pmd)
disk_super = dm_block_data(sblock);
+ /* Verify the data block size hasn't changed */
+ if (le32_to_cpu(disk_super->data_block_size) != pmd->data_block_size) {
+ DMERR("changing the data block size (from %u to %llu) is not supported",
+ le32_to_cpu(disk_super->data_block_size),
+ (unsigned long long)pmd->data_block_size);
+ r = -EINVAL;
+ goto bad_unlock_sblock;
+ }
+
r = __check_incompat_features(disk_super, pmd);
if (r < 0)
goto bad_unlock_sblock;
@@ -769,6 +800,10 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
if (r < 0)
return r;
+ r = save_sm_roots(pmd);
+ if (r < 0)
+ return r;
+
r = superblock_lock(pmd, &sblock);
if (r)
return r;
@@ -780,21 +815,9 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
disk_super->trans_id = cpu_to_le64(pmd->trans_id);
disk_super->flags = cpu_to_le32(pmd->flags);
- r = dm_sm_copy_root(pmd->metadata_sm, &disk_super->metadata_space_map_root,
- metadata_len);
- if (r < 0)
- goto out_locked;
-
- r = dm_sm_copy_root(pmd->data_sm, &disk_super->data_space_map_root,
- data_len);
- if (r < 0)
- goto out_locked;
+ copy_sm_roots(pmd, disk_super);
return dm_tm_commit(pmd->tm, sblock);
-
-out_locked:
- dm_bm_unlock(sblock);
- return r;
}
struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index be70d38745f7..359af3a519b5 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -25,6 +25,9 @@
#define MAPPING_POOL_SIZE 1024
#define PRISON_CELLS 1024
#define COMMIT_PERIOD HZ
+#define NO_SPACE_TIMEOUT_SECS 60
+
+static unsigned no_space_timeout_secs = NO_SPACE_TIMEOUT_SECS;
DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
"A percentage of time allocated for copy on write");
@@ -173,6 +176,7 @@ struct pool {
struct workqueue_struct *wq;
struct work_struct worker;
struct delayed_work waker;
+ struct delayed_work no_space_timeout;
unsigned long last_commit_jiffies;
unsigned ref_count;
@@ -920,7 +924,7 @@ static int commit(struct pool *pool)
{
int r;
- if (get_pool_mode(pool) != PM_WRITE)
+ if (get_pool_mode(pool) >= PM_READ_ONLY)
return -EINVAL;
r = dm_pool_commit_metadata(pool->pmd);
@@ -1392,9 +1396,9 @@ static void process_deferred_bios(struct pool *pool)
*/
if (ensure_next_mapping(pool)) {
spin_lock_irqsave(&pool->lock, flags);
+ bio_list_add(&pool->deferred_bios, bio);
bio_list_merge(&pool->deferred_bios, &bios);
spin_unlock_irqrestore(&pool->lock, flags);
-
break;
}
@@ -1449,6 +1453,20 @@ static void do_waker(struct work_struct *ws)
queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
}
+/*
+ * We're holding onto IO to allow userland time to react. After the
+ * timeout either the pool will have been resized (and thus back in
+ * PM_WRITE mode), or we degrade to PM_READ_ONLY and start erroring IO.
+ */
+static void do_no_space_timeout(struct work_struct *ws)
+{
+ struct pool *pool = container_of(to_delayed_work(ws), struct pool,
+ no_space_timeout);
+
+ if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space)
+ set_pool_mode(pool, PM_READ_ONLY);
+}
+
/*----------------------------------------------------------------*/
struct noflush_work {
@@ -1513,6 +1531,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
struct pool_c *pt = pool->ti->private;
bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
enum pool_mode old_mode = get_pool_mode(pool);
+ unsigned long no_space_timeout = ACCESS_ONCE(no_space_timeout_secs) * HZ;
/*
* Never allow the pool to transition to PM_WRITE mode if user
@@ -1574,6 +1593,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
pool->process_discard = process_discard;
pool->process_prepared_mapping = process_prepared_mapping;
pool->process_prepared_discard = process_prepared_discard_passdown;
+
+ if (!pool->pf.error_if_no_space && no_space_timeout)
+ queue_delayed_work(pool->wq, &pool->no_space_timeout, no_space_timeout);
break;
case PM_WRITE:
@@ -1956,6 +1978,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
INIT_WORK(&pool->worker, do_worker);
INIT_DELAYED_WORK(&pool->waker, do_waker);
+ INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout);
spin_lock_init(&pool->lock);
bio_list_init(&pool->deferred_bios);
bio_list_init(&pool->deferred_flush_bios);
@@ -2519,6 +2542,7 @@ static void pool_postsuspend(struct dm_target *ti)
struct pool *pool = pt->pool;
cancel_delayed_work(&pool->waker);
+ cancel_delayed_work(&pool->no_space_timeout);
flush_workqueue(pool->wq);
(void) commit(pool);
}
@@ -2901,7 +2925,8 @@ static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
*/
if (pt->adjusted_pf.discard_passdown) {
data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
- limits->discard_granularity = data_limits->discard_granularity;
+ limits->discard_granularity = max(data_limits->discard_granularity,
+ pool->sectors_per_block << SECTOR_SHIFT);
} else
limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
}
@@ -3305,6 +3330,9 @@ static void dm_thin_exit(void)
module_init(dm_thin_init);
module_exit(dm_thin_exit);
+module_param_named(no_space_timeout, no_space_timeout_secs, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(no_space_timeout, "Out of data space queue IO timeout in seconds");
+
MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index 796007a5e0e1..7a7bab8947ae 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -330,15 +330,17 @@ test_block_hash:
return r;
}
}
-
todo = 1 << v->data_dev_block_bits;
- while (io->iter.bi_size) {
+ do {
u8 *page;
+ unsigned len;
struct bio_vec bv = bio_iter_iovec(bio, io->iter);
page = kmap_atomic(bv.bv_page);
- r = crypto_shash_update(desc, page + bv.bv_offset,
- bv.bv_len);
+ len = bv.bv_len;
+ if (likely(len >= todo))
+ len = todo;
+ r = crypto_shash_update(desc, page + bv.bv_offset, len);
kunmap_atomic(page);
if (r < 0) {
@@ -346,8 +348,9 @@ test_block_hash:
return r;
}
- bio_advance_iter(bio, &io->iter, bv.bv_len);
- }
+ bio_advance_iter(bio, &io->iter, len);
+ todo -= len;
+ } while (todo);
if (!v->version) {
r = crypto_shash_update(desc, v->salt, v->salt_size);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 8c53b09b9a2c..65ee3a0d4683 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -54,6 +54,8 @@ static void do_deferred_remove(struct work_struct *w);
static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
+static struct workqueue_struct *deferred_remove_workqueue;
+
/*
* For bio-based dm.
* One of these is allocated per bio.
@@ -283,16 +285,24 @@ static int __init local_init(void)
if (r)
goto out_free_rq_tio_cache;
+ deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
+ if (!deferred_remove_workqueue) {
+ r = -ENOMEM;
+ goto out_uevent_exit;
+ }
+
_major = major;
r = register_blkdev(_major, _name);
if (r < 0)
- goto out_uevent_exit;
+ goto out_free_workqueue;
if (!_major)
_major = r;
return 0;
+out_free_workqueue:
+ destroy_workqueue(deferred_remove_workqueue);
out_uevent_exit:
dm_uevent_exit();
out_free_rq_tio_cache:
@@ -306,6 +316,7 @@ out_free_io_cache:
static void local_exit(void)
{
flush_scheduled_work();
+ destroy_workqueue(deferred_remove_workqueue);
kmem_cache_destroy(_rq_tio_cache);
kmem_cache_destroy(_io_cache);
@@ -414,7 +425,7 @@ static void dm_blk_close(struct gendisk *disk, fmode_t mode)
if (atomic_dec_and_test(&md->open_count) &&
(test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
- schedule_work(&deferred_remove_work);
+ queue_work(deferred_remove_workqueue, &deferred_remove_work);
dm_put(md);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4ad5cc4e63e8..73aedcb639c0 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7395,8 +7395,10 @@ void md_do_sync(struct md_thread *thread)
/* just incase thread restarts... */
if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
return;
- if (mddev->ro) /* never try to sync a read-only array */
+ if (mddev->ro) {/* never try to sync a read-only array */
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
return;
+ }
if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
@@ -7503,6 +7505,19 @@ void md_do_sync(struct md_thread *thread)
rdev->recovery_offset < j)
j = rdev->recovery_offset;
rcu_read_unlock();
+
+ /* If there is a bitmap, we need to make sure all
+ * writes that started before we added a spare
+ * complete before we start doing a recovery.
+ * Otherwise the write might complete and (via
+ * bitmap_endwrite) set a bit in the bitmap after the
+ * recovery has checked that bit and skipped that
+ * region.
+ */
+ if (mddev->bitmap) {
+ mddev->pers->quiesce(mddev, 1);
+ mddev->pers->quiesce(mddev, 0);
+ }
}
printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
@@ -7838,6 +7853,7 @@ void md_check_recovery(struct mddev *mddev)
/* There is no thread, but we need to call
* ->spare_active and clear saved_raid_disk
*/
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_reap_sync_thread(mddev);
clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
goto unlock;
@@ -8530,7 +8546,8 @@ static int md_notify_reboot(struct notifier_block *this,
if (mddev_trylock(mddev)) {
if (mddev->pers)
__md_stop_writes(mddev);
- mddev->safemode = 2;
+ if (mddev->persistent)
+ mddev->safemode = 2;
mddev_unlock(mddev);
}
need_delay = 1;
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index 455f79279a16..087411c95ffc 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -595,25 +595,14 @@ int dm_bm_unlock(struct dm_block *b)
}
EXPORT_SYMBOL_GPL(dm_bm_unlock);
-int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
- struct dm_block *superblock)
+int dm_bm_flush(struct dm_block_manager *bm)
{
- int r;
-
if (bm->read_only)
return -EPERM;
- r = dm_bufio_write_dirty_buffers(bm->bufio);
- if (unlikely(r)) {
- dm_bm_unlock(superblock);
- return r;
- }
-
- dm_bm_unlock(superblock);
-
return dm_bufio_write_dirty_buffers(bm->bufio);
}
-EXPORT_SYMBOL_GPL(dm_bm_flush_and_unlock);
+EXPORT_SYMBOL_GPL(dm_bm_flush);
void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b)
{
diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h
index 13cd58e1fe69..1b95dfc17786 100644
--- a/drivers/md/persistent-data/dm-block-manager.h
+++ b/drivers/md/persistent-data/dm-block-manager.h
@@ -105,8 +105,7 @@ int dm_bm_unlock(struct dm_block *b);
*
* This method always blocks.
*/
-int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
- struct dm_block *superblock);
+int dm_bm_flush(struct dm_block_manager *bm);
/*
* Request data is prefetched into the cache.
diff --git a/drivers/md/persistent-data/dm-transaction-manager.c b/drivers/md/persistent-data/dm-transaction-manager.c
index 81da1a26042e..3bc30a0ae3d6 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.c
+++ b/drivers/md/persistent-data/dm-transaction-manager.c
@@ -154,7 +154,7 @@ int dm_tm_pre_commit(struct dm_transaction_manager *tm)
if (r < 0)
return r;
- return 0;
+ return dm_bm_flush(tm->bm);
}
EXPORT_SYMBOL_GPL(dm_tm_pre_commit);
@@ -164,8 +164,9 @@ int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root)
return -EWOULDBLOCK;
wipe_shadow_table(tm);
+ dm_bm_unlock(root);
- return dm_bm_flush_and_unlock(tm->bm, root);
+ return dm_bm_flush(tm->bm);
}
EXPORT_SYMBOL_GPL(dm_tm_commit);
diff --git a/drivers/md/persistent-data/dm-transaction-manager.h b/drivers/md/persistent-data/dm-transaction-manager.h
index b5b139076ca5..2772ed2a781a 100644
--- a/drivers/md/persistent-data/dm-transaction-manager.h
+++ b/drivers/md/persistent-data/dm-transaction-manager.h
@@ -38,18 +38,17 @@ struct dm_transaction_manager *dm_tm_create_non_blocking_clone(struct dm_transac
/*
* We use a 2-phase commit here.
*
- * i) In the first phase the block manager is told to start flushing, and
- * the changes to the space map are written to disk. You should interrogate
- * your particular space map to get detail of its root node etc. to be
- * included in your superblock.
+ * i) Make all changes for the transaction *except* for the superblock.
+ * Then call dm_tm_pre_commit() to flush them to disk.
*
- * ii) @root will be committed last. You shouldn't use more than the
- * first 512 bytes of @root if you wish the transaction to survive a power
- * failure. You *must* have a write lock held on @root for both stage (i)
- * and (ii). The commit will drop the write lock.
+ * ii) Lock your superblock. Update. Then call dm_tm_commit() which will
+ * unlock the superblock and flush it. No other blocks should be updated
+ * during this period. Care should be taken to never unlock a partially
+ * updated superblock; perform any operations that could fail *before* you
+ * take the superblock lock.
*/
int dm_tm_pre_commit(struct dm_transaction_manager *tm);
-int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *root);
+int dm_tm_commit(struct dm_transaction_manager *tm, struct dm_block *superblock);
/*
* These methods are the only way to get hold of a writeable block.
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 4a6ca1cb2e78..55de4f6f7eaf 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -97,6 +97,7 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
struct pool_info *pi = data;
struct r1bio *r1_bio;
struct bio *bio;
+ int need_pages;
int i, j;
r1_bio = r1bio_pool_alloc(gfp_flags, pi);
@@ -119,15 +120,15 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
* RESYNC_PAGES for each bio.
*/
if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery))
- j = pi->raid_disks;
+ need_pages = pi->raid_disks;
else
- j = 1;
- while(j--) {
+ need_pages = 1;
+ for (j = 0; j < need_pages; j++) {
bio = r1_bio->bios[j];
bio->bi_vcnt = RESYNC_PAGES;
if (bio_alloc_pages(bio, gfp_flags))
- goto out_free_bio;
+ goto out_free_pages;
}
/* If not user-requests, copy the page pointers to all bios */
if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
@@ -141,6 +142,14 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
return r1_bio;
+out_free_pages:
+ while (--j >= 0) {
+ struct bio_vec *bv;
+
+ bio_for_each_segment_all(bv, r1_bio->bios[j], i)
+ __free_page(bv->bv_page);
+ }
+
out_free_bio:
while (++j < pi->raid_disks)
bio_put(r1_bio->bios[j]);
@@ -531,11 +540,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
has_nonrot_disk = 0;
choose_next_idle = 0;
- if (conf->mddev->recovery_cp < MaxSector &&
- (this_sector + sectors >= conf->next_resync))
- choose_first = 1;
- else
- choose_first = 0;
+ choose_first = (conf->mddev->recovery_cp < this_sector + sectors);
for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
sector_t dist;
@@ -822,7 +827,7 @@ static void flush_pending_writes(struct r1conf *conf)
* there is no normal IO happeing. It must arrange to call
* lower_barrier when the particular background IO completes.
*/
-static void raise_barrier(struct r1conf *conf)
+static void raise_barrier(struct r1conf *conf, sector_t sector_nr)
{
spin_lock_irq(&conf->resync_lock);
@@ -832,6 +837,7 @@ static void raise_barrier(struct r1conf *conf)
/* block any new IO from starting */
conf->barrier++;
+ conf->next_resync = sector_nr;
/* For these conditions we must wait:
* A: while the array is in frozen state
@@ -840,14 +846,17 @@ static void raise_barrier(struct r1conf *conf)
* C: next_resync + RESYNC_SECTORS > start_next_window, meaning
* next resync will reach to the window which normal bios are
* handling.
+ * D: while there are any active requests in the current window.
*/
wait_event_lock_irq(conf->wait_barrier,
!conf->array_frozen &&
conf->barrier < RESYNC_DEPTH &&
+ conf->current_window_requests == 0 &&
(conf->start_next_window >=
conf->next_resync + RESYNC_SECTORS),
conf->resync_lock);
+ conf->nr_pending++;
spin_unlock_irq(&conf->resync_lock);
}
@@ -857,6 +866,7 @@ static void lower_barrier(struct r1conf *conf)
BUG_ON(conf->barrier <= 0);
spin_lock_irqsave(&conf->resync_lock, flags);
conf->barrier--;
+ conf->nr_pending--;
spin_unlock_irqrestore(&conf->resync_lock, flags);
wake_up(&conf->wait_barrier);
}
@@ -868,12 +878,10 @@ static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio)
if (conf->array_frozen || !bio)
wait = true;
else if (conf->barrier && bio_data_dir(bio) == WRITE) {
- if (conf->next_resync < RESYNC_WINDOW_SECTORS)
- wait = true;
- else if ((conf->next_resync - RESYNC_WINDOW_SECTORS
- >= bio_end_sector(bio)) ||
- (conf->next_resync + NEXT_NORMALIO_DISTANCE
- <= bio->bi_iter.bi_sector))
+ if ((conf->mddev->curr_resync_completed
+ >= bio_end_sector(bio)) ||
+ (conf->next_resync + NEXT_NORMALIO_DISTANCE
+ <= bio->bi_iter.bi_sector))
wait = false;
else
wait = true;
@@ -910,8 +918,8 @@ static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
}
if (bio && bio_data_dir(bio) == WRITE) {
- if (conf->next_resync + NEXT_NORMALIO_DISTANCE
- <= bio->bi_iter.bi_sector) {
+ if (bio->bi_iter.bi_sector >=
+ conf->mddev->curr_resync_completed) {
if (conf->start_next_window == MaxSector)
conf->start_next_window =
conf->next_resync +
@@ -1177,6 +1185,7 @@ read_again:
atomic_read(&bitmap->behind_writes) == 0);
}
r1_bio->read_disk = rdisk;
+ r1_bio->start_next_window = 0;
read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector,
@@ -1492,12 +1501,12 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
mddev->degraded++;
set_bit(Faulty, &rdev->flags);
spin_unlock_irqrestore(&conf->device_lock, flags);
- /*
- * if recovery is running, make sure it aborts.
- */
- set_bit(MD_RECOVERY_INTR, &mddev->recovery);
} else
set_bit(Faulty, &rdev->flags);
+ /*
+ * if recovery is running, make sure it aborts.
+ */
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
printk(KERN_ALERT
"md/raid1:%s: Disk failure on %s, disabling device.\n"
@@ -1539,8 +1548,13 @@ static void close_sync(struct r1conf *conf)
mempool_destroy(conf->r1buf_pool);
conf->r1buf_pool = NULL;
+ spin_lock_irq(&conf->resync_lock);
conf->next_resync = 0;
conf->start_next_window = MaxSector;
+ conf->current_window_requests +=
+ conf->next_window_requests;
+ conf->next_window_requests = 0;
+ spin_unlock_irq(&conf->resync_lock);
}
static int raid1_spare_active(struct mddev *mddev)
@@ -2141,7 +2155,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
d--;
rdev = conf->mirrors[d].rdev;
if (rdev &&
- test_bit(In_sync, &rdev->flags))
+ !test_bit(Faulty, &rdev->flags))
r1_sync_page_io(rdev, sect, s,
conf->tmppage, WRITE);
}
@@ -2153,7 +2167,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
d--;
rdev = conf->mirrors[d].rdev;
if (rdev &&
- test_bit(In_sync, &rdev->flags)) {
+ !test_bit(Faulty, &rdev->flags)) {
if (r1_sync_page_io(rdev, sect, s,
conf->tmppage, READ)) {
atomic_add(s, &rdev->corrected_errors);
@@ -2532,9 +2546,8 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
bitmap_cond_end_sync(mddev->bitmap, sector_nr);
r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
- raise_barrier(conf);
- conf->next_resync = sector_nr;
+ raise_barrier(conf, sector_nr);
rcu_read_lock();
/*
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 33fc408e5eac..a46124ecafc7 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1172,6 +1172,13 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
int max_sectors;
int sectors;
+ /*
+ * Register the new request and wait if the reconstruction
+ * thread has put up a bar for new requests.
+ * Continue immediately if no resync is active currently.
+ */
+ wait_barrier(conf);
+
sectors = bio_sectors(bio);
while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
bio->bi_iter.bi_sector < conf->reshape_progress &&
@@ -1552,12 +1559,6 @@ static void make_request(struct mddev *mddev, struct bio *bio)
md_write_start(mddev, bio);
- /*
- * Register the new request and wait if the reconstruction
- * thread has put up a bar for new requests.
- * Continue immediately if no resync is active currently.
- */
- wait_barrier(conf);
do {
@@ -1683,13 +1684,12 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
spin_unlock_irqrestore(&conf->device_lock, flags);
return;
}
- if (test_and_clear_bit(In_sync, &rdev->flags)) {
+ if (test_and_clear_bit(In_sync, &rdev->flags))
mddev->degraded++;
- /*
- * if recovery is running, make sure it aborts.
- */
- set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- }
+ /*
+ * If recovery is running, make sure it aborts.
+ */
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
set_bit(Blocked, &rdev->flags);
set_bit(Faulty, &rdev->flags);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
@@ -2953,6 +2953,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
*/
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
end_reshape(conf);
+ close_sync(conf);
return 0;
}
@@ -4410,7 +4411,7 @@ read_more:
read_bio->bi_private = r10_bio;
read_bio->bi_end_io = end_sync_read;
read_bio->bi_rw = READ;
- read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
+ read_bio->bi_flags &= (~0UL << BIO_RESET_BITS);
read_bio->bi_flags |= 1 << BIO_UPTODATE;
read_bio->bi_vcnt = 0;
read_bio->bi_iter.bi_size = 0;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 16f5c21963db..4913c0690872 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -64,6 +64,10 @@
#define cpu_to_group(cpu) cpu_to_node(cpu)
#define ANY_GROUP NUMA_NO_NODE
+static bool devices_handle_discard_safely = false;
+module_param(devices_handle_discard_safely, bool, 0644);
+MODULE_PARM_DESC(devices_handle_discard_safely,
+ "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
static struct workqueue_struct *raid5_wq;
/*
* Stripe cache
@@ -3779,6 +3783,8 @@ static void handle_stripe(struct stripe_head *sh)
set_bit(R5_Wantwrite, &dev->flags);
if (prexor)
continue;
+ if (s.failed > 1)
+ continue;
if (!test_bit(R5_Insync, &dev->flags) ||
((i == sh->pd_idx || i == sh->qd_idx) &&
s.failed == 0))
@@ -6115,7 +6121,7 @@ static int run(struct mddev *mddev)
mddev->queue->limits.discard_granularity = stripe;
/*
* unaligned part of discard request will be ignored, so can't
- * guarantee discard_zerors_data
+ * guarantee discard_zeroes_data
*/
mddev->queue->limits.discard_zeroes_data = 0;
@@ -6140,6 +6146,18 @@ static int run(struct mddev *mddev)
!bdev_get_queue(rdev->bdev)->
limits.discard_zeroes_data)
discard_supported = false;
+ /* Unfortunately, discard_zeroes_data is not currently
+ * a guarantee - just a hint. So we only allow DISCARD
+ * if the sysadmin has confirmed that only safe devices
+ * are in use by setting a module parameter.
+ */
+ if (!devices_handle_discard_safely) {
+ if (discard_supported) {
+ pr_info("md/raid456: discard support disabled due to uncertainty.\n");
+ pr_info("Set raid456.devices_handle_discard_safely=Y to override.\n");
+ }
+ discard_supported = false;
+ }
}
if (discard_supported &&