aboutsummaryrefslogtreecommitdiffstats
path: root/block/blk-iolatency.c
diff options
context:
space:
mode:
Diffstat (limited to 'block/blk-iolatency.c')
-rw-r--r--block/blk-iolatency.c339
1 files changed, 223 insertions, 116 deletions
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 19923f8a029d..2620baa1f699 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -72,6 +72,7 @@
#include <linux/sched/loadavg.h>
#include <linux/sched/signal.h>
#include <trace/events/block.h>
+#include <linux/blk-mq.h>
#include "blk-rq-qos.h"
#include "blk-stat.h"
@@ -115,9 +116,22 @@ struct child_latency_info {
atomic_t scale_cookie;
};
+struct percentile_stats {
+ u64 total;
+ u64 missed;
+};
+
+struct latency_stat {
+ union {
+ struct percentile_stats ps;
+ struct blk_rq_stat rqs;
+ };
+};
+
struct iolatency_grp {
struct blkg_policy_data pd;
- struct blk_rq_stat __percpu *stats;
+ struct latency_stat __percpu *stats;
+ struct latency_stat cur_stat;
struct blk_iolatency *blkiolat;
struct rq_depth rq_depth;
struct rq_wait rq_wait;
@@ -132,6 +146,7 @@ struct iolatency_grp {
/* Our current number of IO's for the last summation. */
u64 nr_samples;
+ bool ssd;
struct child_latency_info child_lat;
};
@@ -139,7 +154,7 @@ struct iolatency_grp {
#define BLKIOLATENCY_MAX_WIN_SIZE NSEC_PER_SEC
/*
* These are the constants used to fake the fixed-point moving average
- * calculation just like load average. The call to CALC_LOAD folds
+ * calculation just like load average. The call to calc_load() folds
* (FIXED_1 (2048) - exp_factor) * new_sample into lat_avg. The sampling
* window size is bucketed to try to approximately calculate average
* latency such that 1/exp (decay rate) is [1 min, 2.5 min) when windows
@@ -172,29 +187,101 @@ static inline struct blkcg_gq *lat_to_blkg(struct iolatency_grp *iolat)
return pd_to_blkg(&iolat->pd);
}
-static inline bool iolatency_may_queue(struct iolatency_grp *iolat,
- wait_queue_entry_t *wait,
- bool first_block)
+static inline void latency_stat_init(struct iolatency_grp *iolat,
+ struct latency_stat *stat)
{
- struct rq_wait *rqw = &iolat->rq_wait;
+ if (iolat->ssd) {
+ stat->ps.total = 0;
+ stat->ps.missed = 0;
+ } else
+ blk_rq_stat_init(&stat->rqs);
+}
+
+static inline void latency_stat_sum(struct iolatency_grp *iolat,
+ struct latency_stat *sum,
+ struct latency_stat *stat)
+{
+ if (iolat->ssd) {
+ sum->ps.total += stat->ps.total;
+ sum->ps.missed += stat->ps.missed;
+ } else
+ blk_rq_stat_sum(&sum->rqs, &stat->rqs);
+}
+
+static inline void latency_stat_record_time(struct iolatency_grp *iolat,
+ u64 req_time)
+{
+ struct latency_stat *stat = get_cpu_ptr(iolat->stats);
+ if (iolat->ssd) {
+ if (req_time >= iolat->min_lat_nsec)
+ stat->ps.missed++;
+ stat->ps.total++;
+ } else
+ blk_rq_stat_add(&stat->rqs, req_time);
+ put_cpu_ptr(stat);
+}
+
+static inline bool latency_sum_ok(struct iolatency_grp *iolat,
+ struct latency_stat *stat)
+{
+ if (iolat->ssd) {
+ u64 thresh = div64_u64(stat->ps.total, 10);
+ thresh = max(thresh, 1ULL);
+ return stat->ps.missed < thresh;
+ }
+ return stat->rqs.mean <= iolat->min_lat_nsec;
+}
+
+static inline u64 latency_stat_samples(struct iolatency_grp *iolat,
+ struct latency_stat *stat)
+{
+ if (iolat->ssd)
+ return stat->ps.total;
+ return stat->rqs.nr_samples;
+}
+
+static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat,
+ struct latency_stat *stat)
+{
+ int exp_idx;
+
+ if (iolat->ssd)
+ return;
+
+ /*
+ * calc_load() takes in a number stored in fixed point representation.
+ * Because we are using this for IO time in ns, the values stored
+ * are significantly larger than the FIXED_1 denominator (2048).
+ * Therefore, rounding errors in the calculation are negligible and
+ * can be ignored.
+ */
+ exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1,
+ div64_u64(iolat->cur_win_nsec,
+ BLKIOLATENCY_EXP_BUCKET_SIZE));
+ iolat->lat_avg = calc_load(iolat->lat_avg,
+ iolatency_exp_factors[exp_idx],
+ stat->rqs.mean);
+}
+
+static void iolat_cleanup_cb(struct rq_wait *rqw, void *private_data)
+{
+ atomic_dec(&rqw->inflight);
+ wake_up(&rqw->wait);
+}
- if (first_block && waitqueue_active(&rqw->wait) &&
- rqw->wait.head.next != &wait->entry)
- return false;
+static bool iolat_acquire_inflight(struct rq_wait *rqw, void *private_data)
+{
+ struct iolatency_grp *iolat = private_data;
return rq_wait_inc_below(rqw, iolat->rq_depth.max_depth);
}
static void __blkcg_iolatency_throttle(struct rq_qos *rqos,
struct iolatency_grp *iolat,
- spinlock_t *lock, bool issue_as_root,
+ bool issue_as_root,
bool use_memdelay)
- __releases(lock)
- __acquires(lock)
{
struct rq_wait *rqw = &iolat->rq_wait;
unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay);
- DEFINE_WAIT(wait);
- bool first_block = true;
if (use_delay)
blkcg_schedule_throttle(rqos->q, use_memdelay);
@@ -211,27 +298,7 @@ static void __blkcg_iolatency_throttle(struct rq_qos *rqos,
return;
}
- if (iolatency_may_queue(iolat, &wait, first_block))
- return;
-
- do {
- prepare_to_wait_exclusive(&rqw->wait, &wait,
- TASK_UNINTERRUPTIBLE);
-
- if (iolatency_may_queue(iolat, &wait, first_block))
- break;
- first_block = false;
-
- if (lock) {
- spin_unlock_irq(lock);
- io_schedule();
- spin_lock_irq(lock);
- } else {
- io_schedule();
- }
- } while (1);
-
- finish_wait(&rqw->wait, &wait);
+ rq_qos_wait(rqw, iolat, iolat_acquire_inflight, iolat_cleanup_cb);
}
#define SCALE_DOWN_FACTOR 2
@@ -255,7 +322,7 @@ static void scale_cookie_change(struct blk_iolatency *blkiolat,
struct child_latency_info *lat_info,
bool up)
{
- unsigned long qd = blk_queue_depth(blkiolat->rqos.q);
+ unsigned long qd = blkiolat->rqos.q->nr_requests;
unsigned long scale = scale_amount(qd, up);
unsigned long old = atomic_read(&lat_info->scale_cookie);
unsigned long max_scale = qd << 1;
@@ -295,10 +362,9 @@ static void scale_cookie_change(struct blk_iolatency *blkiolat,
*/
static void scale_change(struct iolatency_grp *iolat, bool up)
{
- unsigned long qd = blk_queue_depth(iolat->blkiolat->rqos.q);
+ unsigned long qd = iolat->blkiolat->rqos.q->nr_requests;
unsigned long scale = scale_amount(qd, up);
unsigned long old = iolat->rq_depth.max_depth;
- bool changed = false;
if (old > qd)
old = qd;
@@ -308,15 +374,13 @@ static void scale_change(struct iolatency_grp *iolat, bool up)
return;
if (old < qd) {
- changed = true;
old += scale;
old = min(old, qd);
iolat->rq_depth.max_depth = old;
wake_up_all(&iolat->rq_wait.wait);
}
- } else if (old > 1) {
+ } else {
old >>= 1;
- changed = true;
iolat->rq_depth.max_depth = max(old, 1UL);
}
}
@@ -369,7 +433,7 @@ static void check_scale_change(struct iolatency_grp *iolat)
* scale down event.
*/
samples_thresh = lat_info->nr_samples * 5;
- samples_thresh = div64_u64(samples_thresh, 100);
+ samples_thresh = max(1ULL, div64_u64(samples_thresh, 100));
if (iolat->nr_samples <= samples_thresh)
return;
}
@@ -391,38 +455,15 @@ static void check_scale_change(struct iolatency_grp *iolat)
scale_change(iolat, direction > 0);
}
-static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio,
- spinlock_t *lock)
+static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio)
{
struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
- struct blkcg *blkcg;
- struct blkcg_gq *blkg;
- struct request_queue *q = rqos->q;
+ struct blkcg_gq *blkg = bio->bi_blkg;
bool issue_as_root = bio_issue_as_root_blkg(bio);
if (!blk_iolatency_enabled(blkiolat))
return;
- rcu_read_lock();
- blkcg = bio_blkcg(bio);
- bio_associate_blkcg(bio, &blkcg->css);
- blkg = blkg_lookup(blkcg, q);
- if (unlikely(!blkg)) {
- if (!lock)
- spin_lock_irq(q->queue_lock);
- blkg = blkg_lookup_create(blkcg, q);
- if (IS_ERR(blkg))
- blkg = NULL;
- if (!lock)
- spin_unlock_irq(q->queue_lock);
- }
- if (!blkg)
- goto out;
-
- bio_issue_init(&bio->bi_issue, bio_sectors(bio));
- bio_associate_blkg(bio, blkg);
-out:
- rcu_read_unlock();
while (blkg && blkg->parent) {
struct iolatency_grp *iolat = blkg_to_lat(blkg);
if (!iolat) {
@@ -431,7 +472,7 @@ out:
}
check_scale_change(iolat);
- __blkcg_iolatency_throttle(rqos, iolat, lock, issue_as_root,
+ __blkcg_iolatency_throttle(rqos, iolat, issue_as_root,
(bio->bi_opf & REQ_SWAP) == REQ_SWAP);
blkg = blkg->parent;
}
@@ -443,7 +484,6 @@ static void iolatency_record_time(struct iolatency_grp *iolat,
struct bio_issue *issue, u64 now,
bool issue_as_root)
{
- struct blk_rq_stat *rq_stat;
u64 start = bio_issue_time(issue);
u64 req_time;
@@ -469,9 +509,7 @@ static void iolatency_record_time(struct iolatency_grp *iolat,
return;
}
- rq_stat = get_cpu_ptr(iolat->stats);
- blk_rq_stat_add(rq_stat, req_time);
- put_cpu_ptr(rq_stat);
+ latency_stat_record_time(iolat, req_time);
}
#define BLKIOLATENCY_MIN_ADJUST_TIME (500 * NSEC_PER_MSEC)
@@ -482,17 +520,17 @@ static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now)
struct blkcg_gq *blkg = lat_to_blkg(iolat);
struct iolatency_grp *parent;
struct child_latency_info *lat_info;
- struct blk_rq_stat stat;
+ struct latency_stat stat;
unsigned long flags;
- int cpu, exp_idx;
+ int cpu;
- blk_rq_stat_init(&stat);
+ latency_stat_init(iolat, &stat);
preempt_disable();
for_each_online_cpu(cpu) {
- struct blk_rq_stat *s;
+ struct latency_stat *s;
s = per_cpu_ptr(iolat->stats, cpu);
- blk_rq_stat_sum(&stat, s);
- blk_rq_stat_init(s);
+ latency_stat_sum(iolat, &stat, s);
+ latency_stat_init(iolat, s);
}
preempt_enable();
@@ -502,41 +540,36 @@ static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now)
lat_info = &parent->child_lat;
- /*
- * CALC_LOAD takes in a number stored in fixed point representation.
- * Because we are using this for IO time in ns, the values stored
- * are significantly larger than the FIXED_1 denominator (2048).
- * Therefore, rounding errors in the calculation are negligible and
- * can be ignored.
- */
- exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1,
- div64_u64(iolat->cur_win_nsec,
- BLKIOLATENCY_EXP_BUCKET_SIZE));
- CALC_LOAD(iolat->lat_avg, iolatency_exp_factors[exp_idx], stat.mean);
+ iolat_update_total_lat_avg(iolat, &stat);
/* Everything is ok and we don't need to adjust the scale. */
- if (stat.mean <= iolat->min_lat_nsec &&
+ if (latency_sum_ok(iolat, &stat) &&
atomic_read(&lat_info->scale_cookie) == DEFAULT_SCALE_COOKIE)
return;
/* Somebody beat us to the punch, just bail. */
spin_lock_irqsave(&lat_info->lock, flags);
+
+ latency_stat_sum(iolat, &iolat->cur_stat, &stat);
lat_info->nr_samples -= iolat->nr_samples;
- lat_info->nr_samples += stat.nr_samples;
- iolat->nr_samples = stat.nr_samples;
+ lat_info->nr_samples += latency_stat_samples(iolat, &iolat->cur_stat);
+ iolat->nr_samples = latency_stat_samples(iolat, &iolat->cur_stat);
if ((lat_info->last_scale_event >= now ||
- now - lat_info->last_scale_event < BLKIOLATENCY_MIN_ADJUST_TIME) &&
- lat_info->scale_lat <= iolat->min_lat_nsec)
+ now - lat_info->last_scale_event < BLKIOLATENCY_MIN_ADJUST_TIME))
goto out;
- if (stat.mean <= iolat->min_lat_nsec &&
- stat.nr_samples >= BLKIOLATENCY_MIN_GOOD_SAMPLES) {
+ if (latency_sum_ok(iolat, &iolat->cur_stat) &&
+ latency_sum_ok(iolat, &stat)) {
+ if (latency_stat_samples(iolat, &iolat->cur_stat) <
+ BLKIOLATENCY_MIN_GOOD_SAMPLES)
+ goto out;
if (lat_info->scale_grp == iolat) {
lat_info->last_scale_event = now;
scale_cookie_change(iolat->blkiolat, lat_info, true);
}
- } else if (stat.mean > iolat->min_lat_nsec) {
+ } else if (lat_info->scale_lat == 0 ||
+ lat_info->scale_lat >= iolat->min_lat_nsec) {
lat_info->last_scale_event = now;
if (!lat_info->scale_grp ||
lat_info->scale_lat > iolat->min_lat_nsec) {
@@ -545,6 +578,7 @@ static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now)
}
scale_cookie_change(iolat->blkiolat, lat_info, false);
}
+ latency_stat_init(iolat, &iolat->cur_stat);
out:
spin_unlock_irqrestore(&lat_info->lock, flags);
}
@@ -558,9 +592,10 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
u64 now = ktime_to_ns(ktime_get());
bool issue_as_root = bio_issue_as_root_blkg(bio);
bool enabled = false;
+ int inflight = 0;
blkg = bio->bi_blkg;
- if (!blkg)
+ if (!blkg || !bio_flagged(bio, BIO_TRACKED))
return;
iolat = blkg_to_lat(bio->bi_blkg);
@@ -568,6 +603,9 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
return;
enabled = blk_iolatency_enabled(iolat->blkiolat);
+ if (!enabled)
+ return;
+
while (blkg && blkg->parent) {
iolat = blkg_to_lat(blkg);
if (!iolat) {
@@ -576,8 +614,9 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
}
rqw = &iolat->rq_wait;
- atomic_dec(&rqw->inflight);
- if (!enabled || iolat->min_lat_nsec == 0)
+ inflight = atomic_dec_return(&rqw->inflight);
+ WARN_ON_ONCE(inflight < 0);
+ if (iolat->min_lat_nsec == 0)
goto next;
iolatency_record_time(iolat, &bio->bi_issue, now,
issue_as_root);
@@ -650,7 +689,7 @@ static void blkiolatency_timer_fn(struct timer_list *t)
* We could be exiting, don't access the pd unless we have a
* ref on the blkg.
*/
- if (!blkg_try_get(blkg))
+ if (!blkg_tryget(blkg))
continue;
iolat = blkg_to_lat(blkg);
@@ -721,10 +760,13 @@ int blk_iolatency_init(struct request_queue *q)
return 0;
}
-static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
+/*
+ * return 1 for enabling iolatency, return -1 for disabling iolatency, otherwise
+ * return 0.
+ */
+static int iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
{
struct iolatency_grp *iolat = blkg_to_lat(blkg);
- struct blk_iolatency *blkiolat = iolat->blkiolat;
u64 oldval = iolat->min_lat_nsec;
iolat->min_lat_nsec = val;
@@ -733,9 +775,10 @@ static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
BLKIOLATENCY_MAX_WIN_SIZE);
if (!oldval && val)
- atomic_inc(&blkiolat->enabled);
+ return 1;
if (oldval && !val)
- atomic_dec(&blkiolat->enabled);
+ return -1;
+ return 0;
}
static void iolatency_clear_scaling(struct blkcg_gq *blkg)
@@ -761,20 +804,19 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
{
struct blkcg *blkcg = css_to_blkcg(of_css(of));
struct blkcg_gq *blkg;
- struct blk_iolatency *blkiolat;
struct blkg_conf_ctx ctx;
struct iolatency_grp *iolat;
char *p, *tok;
u64 lat_val = 0;
u64 oldval;
int ret;
+ int enable = 0;
ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, buf, &ctx);
if (ret)
return ret;
iolat = blkg_to_lat(ctx.blkg);
- blkiolat = iolat->blkiolat;
p = ctx.body;
ret = -EINVAL;
@@ -803,7 +845,12 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
blkg = ctx.blkg;
oldval = iolat->min_lat_nsec;
- iolatency_set_min_lat_nsec(blkg, lat_val);
+ enable = iolatency_set_min_lat_nsec(blkg, lat_val);
+ if (enable) {
+ WARN_ON_ONCE(!blk_get_queue(blkg->q));
+ blkg_get(blkg);
+ }
+
if (oldval != iolat->min_lat_nsec) {
iolatency_clear_scaling(blkg);
}
@@ -811,6 +858,24 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
ret = 0;
out:
blkg_conf_finish(&ctx);
+ if (ret == 0 && enable) {
+ struct iolatency_grp *tmp = blkg_to_lat(blkg);
+ struct blk_iolatency *blkiolat = tmp->blkiolat;
+
+ blk_mq_freeze_queue(blkg->q);
+
+ if (enable == 1)
+ atomic_inc(&blkiolat->enabled);
+ else if (enable == -1)
+ atomic_dec(&blkiolat->enabled);
+ else
+ WARN_ON_ONCE(1);
+
+ blk_mq_unfreeze_queue(blkg->q);
+
+ blkg_put(blkg);
+ blk_put_queue(blkg->q);
+ }
return ret ?: nbytes;
}
@@ -835,13 +900,43 @@ static int iolatency_print_limit(struct seq_file *sf, void *v)
return 0;
}
+static size_t iolatency_ssd_stat(struct iolatency_grp *iolat, char *buf,
+ size_t size)
+{
+ struct latency_stat stat;
+ int cpu;
+
+ latency_stat_init(iolat, &stat);
+ preempt_disable();
+ for_each_online_cpu(cpu) {
+ struct latency_stat *s;
+ s = per_cpu_ptr(iolat->stats, cpu);
+ latency_stat_sum(iolat, &stat, s);
+ }
+ preempt_enable();
+
+ if (iolat->rq_depth.max_depth == UINT_MAX)
+ return scnprintf(buf, size, " missed=%llu total=%llu depth=max",
+ (unsigned long long)stat.ps.missed,
+ (unsigned long long)stat.ps.total);
+ return scnprintf(buf, size, " missed=%llu total=%llu depth=%u",
+ (unsigned long long)stat.ps.missed,
+ (unsigned long long)stat.ps.total,
+ iolat->rq_depth.max_depth);
+}
+
static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf,
size_t size)
{
struct iolatency_grp *iolat = pd_to_lat(pd);
- unsigned long long avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC);
- unsigned long long cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC);
+ unsigned long long avg_lat;
+ unsigned long long cur_win;
+
+ if (iolat->ssd)
+ return iolatency_ssd_stat(iolat, buf, size);
+ avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC);
+ cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC);
if (iolat->rq_depth.max_depth == UINT_MAX)
return scnprintf(buf, size, " depth=max avg_lat=%llu win=%llu",
avg_lat, cur_win);
@@ -858,8 +953,8 @@ static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, int node)
iolat = kzalloc_node(sizeof(*iolat), gfp, node);
if (!iolat)
return NULL;
- iolat->stats = __alloc_percpu_gfp(sizeof(struct blk_rq_stat),
- __alignof__(struct blk_rq_stat), gfp);
+ iolat->stats = __alloc_percpu_gfp(sizeof(struct latency_stat),
+ __alignof__(struct latency_stat), gfp);
if (!iolat->stats) {
kfree(iolat);
return NULL;
@@ -876,15 +971,21 @@ static void iolatency_pd_init(struct blkg_policy_data *pd)
u64 now = ktime_to_ns(ktime_get());
int cpu;
+ if (blk_queue_nonrot(blkg->q))
+ iolat->ssd = true;
+ else
+ iolat->ssd = false;
+
for_each_possible_cpu(cpu) {
- struct blk_rq_stat *stat;
+ struct latency_stat *stat;
stat = per_cpu_ptr(iolat->stats, cpu);
- blk_rq_stat_init(stat);
+ latency_stat_init(iolat, stat);
}
+ latency_stat_init(iolat, &iolat->cur_stat);
rq_wait_init(&iolat->rq_wait);
spin_lock_init(&iolat->child_lat.lock);
- iolat->rq_depth.queue_depth = blk_queue_depth(blkg->q);
+ iolat->rq_depth.queue_depth = blkg->q->nr_requests;
iolat->rq_depth.max_depth = UINT_MAX;
iolat->rq_depth.default_depth = iolat->rq_depth.queue_depth;
iolat->blkiolat = blkiolat;
@@ -910,8 +1011,14 @@ static void iolatency_pd_offline(struct blkg_policy_data *pd)
{
struct iolatency_grp *iolat = pd_to_lat(pd);
struct blkcg_gq *blkg = lat_to_blkg(iolat);
+ struct blk_iolatency *blkiolat = iolat->blkiolat;
+ int ret;
- iolatency_set_min_lat_nsec(blkg, 0);
+ ret = iolatency_set_min_lat_nsec(blkg, 0);
+ if (ret == 1)
+ atomic_inc(&blkiolat->enabled);
+ if (ret == -1)
+ atomic_dec(&blkiolat->enabled);
iolatency_clear_scaling(blkg);
}