aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/sched
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/clock.c21
-rw-r--r--kernel/sched/core.c322
-rw-r--r--kernel/sched/cpufreq_schedutil.c3
-rw-r--r--kernel/sched/deadline.c124
-rw-r--r--kernel/sched/debug.c2
-rw-r--r--kernel/sched/fair.c328
-rw-r--r--kernel/sched/psi.c48
-rw-r--r--kernel/sched/rt.c2
-rw-r--r--kernel/sched/sched.h107
-rw-r--r--kernel/sched/topology.c15
-rw-r--r--kernel/sched/wait.c7
11 files changed, 568 insertions, 411 deletions
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index b5cc2b53464d..3c6193de9cde 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -266,7 +266,7 @@ static __always_inline u64 sched_clock_local(struct sched_clock_data *scd)
s64 delta;
again:
- now = sched_clock();
+ now = sched_clock_noinstr();
delta = now - scd->tick_raw;
if (unlikely(delta < 0))
delta = 0;
@@ -287,28 +287,35 @@ again:
clock = wrap_max(clock, min_clock);
clock = wrap_min(clock, max_clock);
- if (!arch_try_cmpxchg64(&scd->clock, &old_clock, clock))
+ if (!raw_try_cmpxchg64(&scd->clock, &old_clock, clock))
goto again;
return clock;
}
-noinstr u64 local_clock(void)
+noinstr u64 local_clock_noinstr(void)
{
u64 clock;
if (static_branch_likely(&__sched_clock_stable))
- return sched_clock() + __sched_clock_offset;
+ return sched_clock_noinstr() + __sched_clock_offset;
if (!static_branch_likely(&sched_clock_running))
- return sched_clock();
+ return sched_clock_noinstr();
- preempt_disable_notrace();
clock = sched_clock_local(this_scd());
- preempt_enable_notrace();
return clock;
}
+
+u64 local_clock(void)
+{
+ u64 now;
+ preempt_disable_notrace();
+ now = local_clock_noinstr();
+ preempt_enable_notrace();
+ return now;
+}
EXPORT_SYMBOL_GPL(local_clock);
static notrace u64 sched_clock_remote(struct sched_clock_data *scd)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a68d1276bab0..c52c2eba7c73 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2213,6 +2213,154 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
rq_clock_skip_update(rq);
}
+static __always_inline
+int __task_state_match(struct task_struct *p, unsigned int state)
+{
+ if (READ_ONCE(p->__state) & state)
+ return 1;
+
+#ifdef CONFIG_PREEMPT_RT
+ if (READ_ONCE(p->saved_state) & state)
+ return -1;
+#endif
+ return 0;
+}
+
+static __always_inline
+int task_state_match(struct task_struct *p, unsigned int state)
+{
+#ifdef CONFIG_PREEMPT_RT
+ int match;
+
+ /*
+ * Serialize against current_save_and_set_rtlock_wait_state() and
+ * current_restore_rtlock_saved_state().
+ */
+ raw_spin_lock_irq(&p->pi_lock);
+ match = __task_state_match(p, state);
+ raw_spin_unlock_irq(&p->pi_lock);
+
+ return match;
+#else
+ return __task_state_match(p, state);
+#endif
+}
+
+/*
+ * wait_task_inactive - wait for a thread to unschedule.
+ *
+ * Wait for the thread to block in any of the states set in @match_state.
+ * If it changes, i.e. @p might have woken up, then return zero. When we
+ * succeed in waiting for @p to be off its CPU, we return a positive number
+ * (its total switch count). If a second call a short while later returns the
+ * same number, the caller can be sure that @p has remained unscheduled the
+ * whole time.
+ *
+ * The caller must ensure that the task *will* unschedule sometime soon,
+ * else this function might spin for a *long* time. This function can't
+ * be called with interrupts off, or it may introduce deadlock with
+ * smp_call_function() if an IPI is sent by the same process we are
+ * waiting to become inactive.
+ */
+unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
+{
+ int running, queued, match;
+ struct rq_flags rf;
+ unsigned long ncsw;
+ struct rq *rq;
+
+ for (;;) {
+ /*
+ * We do the initial early heuristics without holding
+ * any task-queue locks at all. We'll only try to get
+ * the runqueue lock when things look like they will
+ * work out!
+ */
+ rq = task_rq(p);
+
+ /*
+ * If the task is actively running on another CPU
+ * still, just relax and busy-wait without holding
+ * any locks.
+ *
+ * NOTE! Since we don't hold any locks, it's not
+ * even sure that "rq" stays as the right runqueue!
+ * But we don't care, since "task_on_cpu()" will
+ * return false if the runqueue has changed and p
+ * is actually now running somewhere else!
+ */
+ while (task_on_cpu(rq, p)) {
+ if (!task_state_match(p, match_state))
+ return 0;
+ cpu_relax();
+ }
+
+ /*
+ * Ok, time to look more closely! We need the rq
+ * lock now, to be *sure*. If we're wrong, we'll
+ * just go back and repeat.
+ */
+ rq = task_rq_lock(p, &rf);
+ trace_sched_wait_task(p);
+ running = task_on_cpu(rq, p);
+ queued = task_on_rq_queued(p);
+ ncsw = 0;
+ if ((match = __task_state_match(p, match_state))) {
+ /*
+ * When matching on p->saved_state, consider this task
+ * still queued so it will wait.
+ */
+ if (match < 0)
+ queued = 1;
+ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
+ }
+ task_rq_unlock(rq, p, &rf);
+
+ /*
+ * If it changed from the expected state, bail out now.
+ */
+ if (unlikely(!ncsw))
+ break;
+
+ /*
+ * Was it really running after all now that we
+ * checked with the proper locks actually held?
+ *
+ * Oops. Go back and try again..
+ */
+ if (unlikely(running)) {
+ cpu_relax();
+ continue;
+ }
+
+ /*
+ * It's not enough that it's not actively running,
+ * it must be off the runqueue _entirely_, and not
+ * preempted!
+ *
+ * So if it was still runnable (but just not actively
+ * running right now), it's preempted, and we should
+ * yield - it could be a while.
+ */
+ if (unlikely(queued)) {
+ ktime_t to = NSEC_PER_SEC / HZ;
+
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD);
+ continue;
+ }
+
+ /*
+ * Ahh, all good. It wasn't running, and it wasn't
+ * runnable, which means that it will never become
+ * running in the future either. We're all done!
+ */
+ break;
+ }
+
+ return ncsw;
+}
+
#ifdef CONFIG_SMP
static void
@@ -2398,7 +2546,6 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
if (!is_cpu_allowed(p, dest_cpu))
return rq;
- update_rq_clock(rq);
rq = move_queued_task(rq, rf, p, dest_cpu);
return rq;
@@ -2456,10 +2603,12 @@ static int migration_cpu_stop(void *data)
goto out;
}
- if (task_on_rq_queued(p))
+ if (task_on_rq_queued(p)) {
+ update_rq_clock(rq);
rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
- else
+ } else {
p->wake_cpu = arg->dest_cpu;
+ }
/*
* XXX __migrate_task() can fail, at which point we might end
@@ -3341,114 +3490,6 @@ out:
}
#endif /* CONFIG_NUMA_BALANCING */
-/*
- * wait_task_inactive - wait for a thread to unschedule.
- *
- * Wait for the thread to block in any of the states set in @match_state.
- * If it changes, i.e. @p might have woken up, then return zero. When we
- * succeed in waiting for @p to be off its CPU, we return a positive number
- * (its total switch count). If a second call a short while later returns the
- * same number, the caller can be sure that @p has remained unscheduled the
- * whole time.
- *
- * The caller must ensure that the task *will* unschedule sometime soon,
- * else this function might spin for a *long* time. This function can't
- * be called with interrupts off, or it may introduce deadlock with
- * smp_call_function() if an IPI is sent by the same process we are
- * waiting to become inactive.
- */
-unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
-{
- int running, queued;
- struct rq_flags rf;
- unsigned long ncsw;
- struct rq *rq;
-
- for (;;) {
- /*
- * We do the initial early heuristics without holding
- * any task-queue locks at all. We'll only try to get
- * the runqueue lock when things look like they will
- * work out!
- */
- rq = task_rq(p);
-
- /*
- * If the task is actively running on another CPU
- * still, just relax and busy-wait without holding
- * any locks.
- *
- * NOTE! Since we don't hold any locks, it's not
- * even sure that "rq" stays as the right runqueue!
- * But we don't care, since "task_on_cpu()" will
- * return false if the runqueue has changed and p
- * is actually now running somewhere else!
- */
- while (task_on_cpu(rq, p)) {
- if (!(READ_ONCE(p->__state) & match_state))
- return 0;
- cpu_relax();
- }
-
- /*
- * Ok, time to look more closely! We need the rq
- * lock now, to be *sure*. If we're wrong, we'll
- * just go back and repeat.
- */
- rq = task_rq_lock(p, &rf);
- trace_sched_wait_task(p);
- running = task_on_cpu(rq, p);
- queued = task_on_rq_queued(p);
- ncsw = 0;
- if (READ_ONCE(p->__state) & match_state)
- ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
- task_rq_unlock(rq, p, &rf);
-
- /*
- * If it changed from the expected state, bail out now.
- */
- if (unlikely(!ncsw))
- break;
-
- /*
- * Was it really running after all now that we
- * checked with the proper locks actually held?
- *
- * Oops. Go back and try again..
- */
- if (unlikely(running)) {
- cpu_relax();
- continue;
- }
-
- /*
- * It's not enough that it's not actively running,
- * it must be off the runqueue _entirely_, and not
- * preempted!
- *
- * So if it was still runnable (but just not actively
- * running right now), it's preempted, and we should
- * yield - it could be a while.
- */
- if (unlikely(queued)) {
- ktime_t to = NSEC_PER_SEC / HZ;
-
- set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD);
- continue;
- }
-
- /*
- * Ahh, all good. It wasn't running, and it wasn't
- * runnable, which means that it will never become
- * running in the future either. We're all done!
- */
- break;
- }
-
- return ncsw;
-}
-
/***
* kick_process - kick a running thread to enter/exit the kernel
* @p: the to-be-kicked thread
@@ -4003,15 +4044,14 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
static __always_inline
bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
{
+ int match;
+
if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) &&
state != TASK_RTLOCK_WAIT);
}
- if (READ_ONCE(p->__state) & state) {
- *success = 1;
- return true;
- }
+ *success = !!(match = __task_state_match(p, state));
#ifdef CONFIG_PREEMPT_RT
/*
@@ -4027,12 +4067,10 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
* p::saved_state to TASK_RUNNING so any further tests will
* not result in false positives vs. @success
*/
- if (p->saved_state & state) {
+ if (match < 0)
p->saved_state = TASK_RUNNING;
- *success = 1;
- }
#endif
- return false;
+ return match > 0;
}
/*
@@ -5632,6 +5670,9 @@ void scheduler_tick(void)
perf_event_task_tick();
+ if (curr->flags & PF_WQ_WORKER)
+ wq_worker_tick(curr);
+
#ifdef CONFIG_SMP
rq->idle_balance = idle_cpu(cpu);
trigger_load_balance(rq);
@@ -7590,6 +7631,7 @@ static int __sched_setscheduler(struct task_struct *p,
int reset_on_fork;
int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
struct rq *rq;
+ bool cpuset_locked = false;
/* The pi code expects interrupts enabled */
BUG_ON(pi && in_interrupt());
@@ -7639,8 +7681,14 @@ recheck:
return retval;
}
- if (pi)
- cpuset_read_lock();
+ /*
+ * SCHED_DEADLINE bandwidth accounting relies on stable cpusets
+ * information.
+ */
+ if (dl_policy(policy) || dl_policy(p->policy)) {
+ cpuset_locked = true;
+ cpuset_lock();
+ }
/*
* Make sure no PI-waiters arrive (or leave) while we are
@@ -7716,8 +7764,8 @@ change:
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
policy = oldpolicy = -1;
task_rq_unlock(rq, p, &rf);
- if (pi)
- cpuset_read_unlock();
+ if (cpuset_locked)
+ cpuset_unlock();
goto recheck;
}
@@ -7784,7 +7832,8 @@ change:
task_rq_unlock(rq, p, &rf);
if (pi) {
- cpuset_read_unlock();
+ if (cpuset_locked)
+ cpuset_unlock();
rt_mutex_adjust_pi(p);
}
@@ -7796,8 +7845,8 @@ change:
unlock:
task_rq_unlock(rq, p, &rf);
- if (pi)
- cpuset_read_unlock();
+ if (cpuset_locked)
+ cpuset_unlock();
return retval;
}
@@ -9286,8 +9335,7 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur,
return ret;
}
-int task_can_attach(struct task_struct *p,
- const struct cpumask *cs_effective_cpus)
+int task_can_attach(struct task_struct *p)
{
int ret = 0;
@@ -9300,21 +9348,9 @@ int task_can_attach(struct task_struct *p,
* success of set_cpus_allowed_ptr() on all attached tasks
* before cpus_mask may be changed.
*/
- if (p->flags & PF_NO_SETAFFINITY) {
+ if (p->flags & PF_NO_SETAFFINITY)
ret = -EINVAL;
- goto out;
- }
-
- if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
- cs_effective_cpus)) {
- int cpu = cpumask_any_and(cpu_active_mask, cs_effective_cpus);
-
- if (unlikely(cpu >= nr_cpu_ids))
- return -EINVAL;
- ret = dl_cpu_busy(cpu, p);
- }
-out:
return ret;
}
@@ -9548,6 +9584,7 @@ void set_rq_offline(struct rq *rq)
if (rq->online) {
const struct sched_class *class;
+ update_rq_clock(rq);
for_each_class(class) {
if (class->rq_offline)
class->rq_offline(rq);
@@ -9596,7 +9633,7 @@ static void cpuset_cpu_active(void)
static int cpuset_cpu_inactive(unsigned int cpu)
{
if (!cpuhp_tasks_frozen) {
- int ret = dl_cpu_busy(cpu, NULL);
+ int ret = dl_bw_check_overflow(cpu);
if (ret)
return ret;
@@ -9689,7 +9726,6 @@ int sched_cpu_deactivate(unsigned int cpu)
rq_lock_irqsave(rq, &rf);
if (rq->rd) {
- update_rq_clock(rq);
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
}
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index e3211455b203..4492608b7d7f 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -155,10 +155,11 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
static void sugov_get_util(struct sugov_cpu *sg_cpu)
{
+ unsigned long util = cpu_util_cfs_boost(sg_cpu->cpu);
struct rq *rq = cpu_rq(sg_cpu->cpu);
sg_cpu->bw_dl = cpu_bw_dl(rq);
- sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu),
+ sg_cpu->util = effective_cpu_util(sg_cpu->cpu, util,
FREQUENCY_UTIL, NULL);
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 5a9a4b81c972..58b542bf2893 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -16,6 +16,8 @@
* Fabio Checconi <fchecconi@gmail.com>
*/
+#include <linux/cpuset.h>
+
/*
* Default limits for DL period; on the top end we guard against small util
* tasks still getting ridiculously long effective runtimes, on the bottom end we
@@ -489,13 +491,6 @@ static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
-void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
-{
- raw_spin_lock_init(&dl_b->dl_runtime_lock);
- dl_b->dl_period = period;
- dl_b->dl_runtime = runtime;
-}
-
void init_dl_bw(struct dl_bw *dl_b)
{
raw_spin_lock_init(&dl_b->lock);
@@ -1260,43 +1255,39 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
}
/*
- * This function implements the GRUB accounting rule:
- * according to the GRUB reclaiming algorithm, the runtime is
- * not decreased as "dq = -dt", but as
- * "dq = -max{u / Umax, (1 - Uinact - Uextra)} dt",
+ * This function implements the GRUB accounting rule. According to the
+ * GRUB reclaiming algorithm, the runtime is not decreased as "dq = -dt",
+ * but as "dq = -(max{u, (Umax - Uinact - Uextra)} / Umax) dt",
* where u is the utilization of the task, Umax is the maximum reclaimable
* utilization, Uinact is the (per-runqueue) inactive utilization, computed
* as the difference between the "total runqueue utilization" and the
- * runqueue active utilization, and Uextra is the (per runqueue) extra
+ * "runqueue active utilization", and Uextra is the (per runqueue) extra
* reclaimable utilization.
- * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations
- * multiplied by 2^BW_SHIFT, the result has to be shifted right by
- * BW_SHIFT.
- * Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT,
- * dl_bw is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
- * Since delta is a 64 bit variable, to have an overflow its value
- * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds.
- * So, overflow is not an issue here.
+ * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations multiplied
+ * by 2^BW_SHIFT, the result has to be shifted right by BW_SHIFT.
+ * Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT, dl_bw
+ * is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
+ * Since delta is a 64 bit variable, to have an overflow its value should be
+ * larger than 2^(64 - 20 - 8), which is more than 64 seconds. So, overflow is
+ * not an issue here.
*/
static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
{
- u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */
u64 u_act;
- u64 u_act_min = (dl_se->dl_bw * rq->dl.bw_ratio) >> RATIO_SHIFT;
+ u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */
/*
- * Instead of computing max{u * bw_ratio, (1 - u_inact - u_extra)},
- * we compare u_inact + rq->dl.extra_bw with
- * 1 - (u * rq->dl.bw_ratio >> RATIO_SHIFT), because
- * u_inact + rq->dl.extra_bw can be larger than
- * 1 * (so, 1 - u_inact - rq->dl.extra_bw would be negative
- * leading to wrong results)
+ * Instead of computing max{u, (u_max - u_inact - u_extra)}, we
+ * compare u_inact + u_extra with u_max - u, because u_inact + u_extra
+ * can be larger than u_max. So, u_max - u_inact - u_extra would be
+ * negative leading to wrong results.
*/
- if (u_inact + rq->dl.extra_bw > BW_UNIT - u_act_min)
- u_act = u_act_min;
+ if (u_inact + rq->dl.extra_bw > rq->dl.max_bw - dl_se->dl_bw)
+ u_act = dl_se->dl_bw;
else
- u_act = BW_UNIT - u_inact - rq->dl.extra_bw;
+ u_act = rq->dl.max_bw - u_inact - rq->dl.extra_bw;
+ u_act = (u_act * rq->dl.bw_ratio) >> RATIO_SHIFT;
return (delta * u_act) >> BW_SHIFT;
}
@@ -2596,6 +2587,12 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
if (task_on_rq_queued(p) && p->dl.dl_runtime)
task_non_contending(p);
+ /*
+ * In case a task is setscheduled out from SCHED_DEADLINE we need to
+ * keep track of that on its cpuset (for correct bandwidth tracking).
+ */
+ dec_dl_tasks_cs(p);
+
if (!task_on_rq_queued(p)) {
/*
* Inactive timer is armed. However, p is leaving DEADLINE and
@@ -2636,6 +2633,12 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
put_task_struct(p);
+ /*
+ * In case a task is setscheduled to SCHED_DEADLINE we need to keep
+ * track of that on its cpuset (for correct bandwidth tracking).
+ */
+ inc_dl_tasks_cs(p);
+
/* If p is not queued we will update its parameters at next wakeup. */
if (!task_on_rq_queued(p)) {
add_rq_bw(&p->dl, &rq->dl);
@@ -2795,12 +2798,12 @@ static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
{
if (global_rt_runtime() == RUNTIME_INF) {
dl_rq->bw_ratio = 1 << RATIO_SHIFT;
- dl_rq->extra_bw = 1 << BW_SHIFT;
+ dl_rq->max_bw = dl_rq->extra_bw = 1 << BW_SHIFT;
} else {
dl_rq->bw_ratio = to_ratio(global_rt_runtime(),
global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT);
- dl_rq->extra_bw = to_ratio(global_rt_period(),
- global_rt_runtime());
+ dl_rq->max_bw = dl_rq->extra_bw =
+ to_ratio(global_rt_period(), global_rt_runtime());
}
}
@@ -3044,26 +3047,38 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
return ret;
}
-int dl_cpu_busy(int cpu, struct task_struct *p)
+enum dl_bw_request {
+ dl_bw_req_check_overflow = 0,
+ dl_bw_req_alloc,
+ dl_bw_req_free
+};
+
+static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw)
{
- unsigned long flags, cap;
+ unsigned long flags;
struct dl_bw *dl_b;
- bool overflow;
+ bool overflow = 0;
rcu_read_lock_sched();
dl_b = dl_bw_of(cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
- cap = dl_bw_capacity(cpu);
- overflow = __dl_overflow(dl_b, cap, 0, p ? p->dl.dl_bw : 0);
- if (!overflow && p) {
- /*
- * We reserve space for this task in the destination
- * root_domain, as we can't fail after this point.
- * We will free resources in the source root_domain
- * later on (see set_cpus_allowed_dl()).
- */
- __dl_add(dl_b, p->dl.dl_bw, dl_bw_cpus(cpu));
+ if (req == dl_bw_req_free) {
+ __dl_sub(dl_b, dl_bw, dl_bw_cpus(cpu));
+ } else {
+ unsigned long cap = dl_bw_capacity(cpu);
+
+ overflow = __dl_overflow(dl_b, cap, 0, dl_bw);
+
+ if (req == dl_bw_req_alloc && !overflow) {
+ /*
+ * We reserve space in the destination
+ * root_domain, as we can't fail after this point.
+ * We will free resources in the source root_domain
+ * later on (see set_cpus_allowed_dl()).
+ */
+ __dl_add(dl_b, dl_bw, dl_bw_cpus(cpu));
+ }
}
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
@@ -3071,6 +3086,21 @@ int dl_cpu_busy(int cpu, struct task_struct *p)
return overflow ? -EBUSY : 0;
}
+
+int dl_bw_check_overflow(int cpu)
+{
+ return dl_bw_manage(dl_bw_req_check_overflow, cpu, 0);
+}
+
+int dl_bw_alloc(int cpu, u64 dl_bw)
+{
+ return dl_bw_manage(dl_bw_req_alloc, cpu, dl_bw);
+}
+
+void dl_bw_free(int cpu, u64 dl_bw)
+{
+ dl_bw_manage(dl_bw_req_free, cpu, dl_bw);
+}
#endif
#ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 0b2340a79b65..066ff1c8ae4e 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -777,7 +777,7 @@ static void print_cpu(struct seq_file *m, int cpu)
#define P(x) \
do { \
if (sizeof(rq->x) == 4) \
- SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \
+ SEQ_printf(m, " .%-30s: %d\n", #x, (int)(rq->x)); \
else \
SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\
} while (0)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 373ff5f55884..1d9c2482c5a3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1064,6 +1064,23 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Scheduling class queueing methods:
*/
+static inline bool is_core_idle(int cpu)
+{
+#ifdef CONFIG_SCHED_SMT
+ int sibling;
+
+ for_each_cpu(sibling, cpu_smt_mask(cpu)) {
+ if (cpu == sibling)
+ continue;
+
+ if (!idle_cpu(sibling))
+ return false;
+ }
+#endif
+
+ return true;
+}
+
#ifdef CONFIG_NUMA
#define NUMA_IMBALANCE_MIN 2
@@ -1700,23 +1717,6 @@ struct numa_stats {
int idle_cpu;
};
-static inline bool is_core_idle(int cpu)
-{
-#ifdef CONFIG_SCHED_SMT
- int sibling;
-
- for_each_cpu(sibling, cpu_smt_mask(cpu)) {
- if (cpu == sibling)
- continue;
-
- if (!idle_cpu(sibling))
- return false;
- }
-#endif
-
- return true;
-}
-
struct task_numa_env {
struct task_struct *p;
@@ -5577,6 +5577,14 @@ static void __cfsb_csd_unthrottle(void *arg)
rq_lock(rq, &rf);
/*
+ * Iterating over the list can trigger several call to
+ * update_rq_clock() in unthrottle_cfs_rq().
+ * Do it once and skip the potential next ones.
+ */
+ update_rq_clock(rq);
+ rq_clock_start_loop_update(rq);
+
+ /*
* Since we hold rq lock we're safe from concurrent manipulation of
* the CSD list. However, this RCU critical section annotates the
* fact that we pair with sched_free_group_rcu(), so that we cannot
@@ -5595,6 +5603,7 @@ static void __cfsb_csd_unthrottle(void *arg)
rcu_read_unlock();
+ rq_clock_stop_loop_update(rq);
rq_unlock(rq, &rf);
}
@@ -6115,6 +6124,13 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
lockdep_assert_rq_held(rq);
+ /*
+ * The rq clock has already been updated in the
+ * set_rq_offline(), so we should skip updating
+ * the rq clock again in unthrottle_cfs_rq().
+ */
+ rq_clock_start_loop_update(rq);
+
rcu_read_lock();
list_for_each_entry_rcu(tg, &task_groups, list) {
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
@@ -6137,6 +6153,8 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
unthrottle_cfs_rq(cfs_rq);
}
rcu_read_unlock();
+
+ rq_clock_stop_loop_update(rq);
}
#else /* CONFIG_CFS_BANDWIDTH */
@@ -7156,7 +7174,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
recent_used_cpu != target &&
cpus_share_cache(recent_used_cpu, target) &&
(available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
- cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
+ cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
return recent_used_cpu;
}
@@ -7202,14 +7220,58 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
return target;
}
-/*
- * Predicts what cpu_util(@cpu) would return if @p was removed from @cpu
- * (@dst_cpu = -1) or migrated to @dst_cpu.
- */
-static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
+/**
+ * cpu_util() - Estimates the amount of CPU capacity used by CFS tasks.
+ * @cpu: the CPU to get the utilization for
+ * @p: task for which the CPU utilization should be predicted or NULL
+ * @dst_cpu: CPU @p migrates to, -1 if @p moves from @cpu or @p == NULL
+ * @boost: 1 to enable boosting, otherwise 0
+ *
+ * The unit of the return value must be the same as the one of CPU capacity
+ * so that CPU utilization can be compared with CPU capacity.
+ *
+ * CPU utilization is the sum of running time of runnable tasks plus the
+ * recent utilization of currently non-runnable tasks on that CPU.
+ * It represents the amount of CPU capacity currently used by CFS tasks in
+ * the range [0..max CPU capacity] with max CPU capacity being the CPU
+ * capacity at f_max.
+ *
+ * The estimated CPU utilization is defined as the maximum between CPU
+ * utilization and sum of the estimated utilization of the currently
+ * runnable tasks on that CPU. It preserves a utilization "snapshot" of
+ * previously-executed tasks, which helps better deduce how busy a CPU will
+ * be when a long-sleeping task wakes up. The contribution to CPU utilization
+ * of such a task would be significantly decayed at this point of time.
+ *
+ * Boosted CPU utilization is defined as max(CPU runnable, CPU utilization).
+ * CPU contention for CFS tasks can be detected by CPU runnable > CPU
+ * utilization. Boosting is implemented in cpu_util() so that internal
+ * users (e.g. EAS) can use it next to external users (e.g. schedutil),
+ * latter via cpu_util_cfs_boost().
+ *
+ * CPU utilization can be higher than the current CPU capacity
+ * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because
+ * of rounding errors as well as task migrations or wakeups of new tasks.
+ * CPU utilization has to be capped to fit into the [0..max CPU capacity]
+ * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%)
+ * could be seen as over-utilized even though CPU1 has 20% of spare CPU
+ * capacity. CPU utilization is allowed to overshoot current CPU capacity
+ * though since this is useful for predicting the CPU capacity required
+ * after task migrations (scheduler-driven DVFS).
+ *
+ * Return: (Boosted) (estimated) utilization for the specified CPU.
+ */
+static unsigned long
+cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
{
struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
unsigned long util = READ_ONCE(cfs_rq->avg.util_avg);
+ unsigned long runnable;
+
+ if (boost) {
+ runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
+ util = max(util, runnable);
+ }
/*
* If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its
@@ -7217,9 +7279,9 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
* contribution. In all the other cases @cpu is not impacted by the
* migration so its util_avg is already correct.
*/
- if (task_cpu(p) == cpu && dst_cpu != cpu)
+ if (p && task_cpu(p) == cpu && dst_cpu != cpu)
lsub_positive(&util, task_util(p));
- else if (task_cpu(p) != cpu && dst_cpu == cpu)
+ else if (p && task_cpu(p) != cpu && dst_cpu == cpu)
util += task_util(p);
if (sched_feat(UTIL_EST)) {
@@ -7255,7 +7317,7 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
*/
if (dst_cpu == cpu)
util_est += _task_util_est(p);
- else if (unlikely(task_on_rq_queued(p) || current == p))
+ else if (p && unlikely(task_on_rq_queued(p) || current == p))
lsub_positive(&util_est, _task_util_est(p));
util = max(util, util_est);
@@ -7264,6 +7326,16 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
return min(util, capacity_orig_of(cpu));
}
+unsigned long cpu_util_cfs(int cpu)
+{
+ return cpu_util(cpu, NULL, -1, 0);
+}
+
+unsigned long cpu_util_cfs_boost(int cpu)
+{
+ return cpu_util(cpu, NULL, -1, 1);
+}
+
/*
* cpu_util_without: compute cpu utilization without any contributions from *p
* @cpu: the CPU which utilization is requested
@@ -7281,9 +7353,9 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
{
/* Task has no contribution or is new */
if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
- return cpu_util_cfs(cpu);
+ p = NULL;
- return cpu_util_next(cpu, p, -1);
+ return cpu_util(cpu, p, -1, 0);
}
/*
@@ -7330,7 +7402,7 @@ static inline void eenv_task_busy_time(struct energy_env *eenv,
* cpu_capacity.
*
* The contribution of the task @p for which we want to estimate the
- * energy cost is removed (by cpu_util_next()) and must be calculated
+ * energy cost is removed (by cpu_util()) and must be calculated
* separately (see eenv_task_busy_time). This ensures:
*
* - A stable PD utilization, no matter which CPU of that PD we want to place
@@ -7351,7 +7423,7 @@ static inline void eenv_pd_busy_time(struct energy_env *eenv,
int cpu;
for_each_cpu(cpu, pd_cpus) {
- unsigned long util = cpu_util_next(cpu, p, -1);
+ unsigned long util = cpu_util(cpu, p, -1, 0);
busy_time += effective_cpu_util(cpu, util, ENERGY_UTIL, NULL);
}
@@ -7375,8 +7447,8 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
for_each_cpu(cpu, pd_cpus) {
struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
- unsigned long util = cpu_util_next(cpu, p, dst_cpu);
- unsigned long cpu_util;
+ unsigned long util = cpu_util(cpu, p, dst_cpu, 1);
+ unsigned long eff_util;
/*
* Performance domain frequency: utilization clamping
@@ -7385,8 +7457,8 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
* NOTE: in case RT tasks are running, by default the
* FREQUENCY_UTIL's utilization can be max OPP.
*/
- cpu_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk);
- max_util = max(max_util, cpu_util);
+ eff_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk);
+ max_util = max(max_util, eff_util);
}
return min(max_util, eenv->cpu_cap);
@@ -7521,7 +7593,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
continue;
- util = cpu_util_next(cpu, p, cpu);
+ util = cpu_util(cpu, p, cpu, 0);
cpu_cap = capacity_of(cpu);
/*
@@ -9331,96 +9403,61 @@ group_type group_classify(unsigned int imbalance_pct,
}
/**
- * asym_smt_can_pull_tasks - Check whether the load balancing CPU can pull tasks
- * @dst_cpu: Destination CPU of the load balancing
+ * sched_use_asym_prio - Check whether asym_packing priority must be used
+ * @sd: The scheduling domain of the load balancing
+ * @cpu: A CPU
+ *
+ * Always use CPU priority when balancing load between SMT siblings. When
+ * balancing load between cores, it is not sufficient that @cpu is idle. Only
+ * use CPU priority if the whole core is idle.
+ *
+ * Returns: True if the priority of @cpu must be followed. False otherwise.
+ */
+static bool sched_use_asym_prio(struct sched_domain *sd, int cpu)
+{
+ if (!sched_smt_active())
+ return true;
+
+ return sd->flags & SD_SHARE_CPUCAPACITY || is_core_idle(cpu);
+}
+
+/**
+ * sched_asym - Check if the destination CPU can do asym_packing load balance
+ * @env: The load balancing environment
* @sds: Load-balancing data with statistics of the local group
* @sgs: Load-balancing statistics of the candidate busiest group
- * @sg: The candidate busiest group
+ * @group: The candidate busiest group
*
- * Check the state of the SMT siblings of both @sds::local and @sg and decide
- * if @dst_cpu can pull tasks.
+ * @env::dst_cpu can do asym_packing if it has higher priority than the
+ * preferred CPU of @group.
*
- * If @dst_cpu does not have SMT siblings, it can pull tasks if two or more of
- * the SMT siblings of @sg are busy. If only one CPU in @sg is busy, pull tasks
- * only if @dst_cpu has higher priority.
+ * SMT is a special case. If we are balancing load between cores, @env::dst_cpu
+ * can do asym_packing balance only if all its SMT siblings are idle. Also, it
+ * can only do it if @group is an SMT group and has exactly on busy CPU. Larger
+ * imbalances in the number of CPUS are dealt with in find_busiest_group().
*
- * If both @dst_cpu and @sg have SMT siblings, and @sg has exactly one more
- * busy CPU than @sds::local, let @dst_cpu pull tasks if it has higher priority.
- * Bigger imbalances in the number of busy CPUs will be dealt with in
- * update_sd_pick_busiest().
+ * If we are balancing load within an SMT core, or at DIE domain level, always
+ * proceed.
*
- * If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings
- * of @dst_cpu are idle and @sg has lower priority.
- *
- * Return: true if @dst_cpu can pull tasks, false otherwise.
+ * Return: true if @env::dst_cpu can do with asym_packing load balance. False
+ * otherwise.
*/
-static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds,
- struct sg_lb_stats *sgs,
- struct sched_group *sg)
+static inline bool
+sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs,
+ struct sched_group *group)
{
-#ifdef CONFIG_SCHED_SMT
- bool local_is_smt, sg_is_smt;
- int sg_busy_cpus;
-
- local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY;
- sg_is_smt = sg->flags & SD_SHARE_CPUCAPACITY;
-
- sg_busy_cpus = sgs->group_weight - sgs->idle_cpus;
-
- if (!local_is_smt) {
- /*
- * If we are here, @dst_cpu is idle and does not have SMT
- * siblings. Pull tasks if candidate group has two or more
- * busy CPUs.
- */
- if (sg_busy_cpus >= 2) /* implies sg_is_smt */
- return true;
-
- /*
- * @dst_cpu does not have SMT siblings. @sg may have SMT
- * siblings and only one is busy. In such case, @dst_cpu
- * can help if it has higher priority and is idle (i.e.,
- * it has no running tasks).
- */
- return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
- }
-
- /* @dst_cpu has SMT siblings. */
-
- if (sg_is_smt) {
- int local_busy_cpus = sds->local->group_weight -
- sds->local_stat.idle_cpus;
- int busy_cpus_delta = sg_busy_cpus - local_busy_cpus;
-
- if (busy_cpus_delta == 1)
- return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
-
+ /* Ensure that the whole local core is idle, if applicable. */
+ if (!sched_use_asym_prio(env->sd, env->dst_cpu))
return false;
- }
/*
- * @sg does not have SMT siblings. Ensure that @sds::local does not end
- * up with more than one busy SMT sibling and only pull tasks if there
- * are not busy CPUs (i.e., no CPU has running tasks).
+ * CPU priorities does not make sense for SMT cores with more than one
+ * busy sibling.
*/
- if (!sds->local_stat.sum_nr_running)
- return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
-
- return false;
-#else
- /* Always return false so that callers deal with non-SMT cases. */
- return false;
-#endif
-}
-
-static inline bool
-sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs,
- struct sched_group *group)
-{
- /* Only do SMT checks if either local or candidate have SMT siblings */
- if ((sds->local->flags & SD_SHARE_CPUCAPACITY) ||
- (group->flags & SD_SHARE_CPUCAPACITY))
- return asym_smt_can_pull_tasks(env->dst_cpu, sds, sgs, group);
+ if (group->flags & SD_SHARE_CPUCAPACITY) {
+ if (sgs->group_weight - sgs->idle_cpus != 1)
+ return false;
+ }
return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
}
@@ -9610,10 +9647,22 @@ static bool update_sd_pick_busiest(struct lb_env *env,
* contention when accessing shared HW resources.
*
* XXX for now avg_load is not computed and always 0 so we
- * select the 1st one.
+ * select the 1st one, except if @sg is composed of SMT
+ * siblings.
*/
- if (sgs->avg_load <= busiest->avg_load)
+
+ if (sgs->avg_load < busiest->avg_load)
return false;
+
+ if (sgs->avg_load == busiest->avg_load) {
+ /*
+ * SMT sched groups need more help than non-SMT groups.
+ * If @sg happens to also be SMT, either choice is good.
+ */
+ if (sds->busiest->flags & SD_SHARE_CPUCAPACITY)
+ return false;
+ }
+
break;
case group_has_spare:
@@ -10088,7 +10137,6 @@ static void update_idle_cpu_scan(struct lb_env *env,
static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
- struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs;
@@ -10129,8 +10177,13 @@ next_group:
sg = sg->next;
} while (sg != env->sd->groups);
- /* Tag domain that child domain prefers tasks go to siblings first */
- sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
+ /*
+ * Indicate that the child domain of the busiest group prefers tasks
+ * go to a child's sibling domains first. NB the flags of a sched group
+ * are those of the child domain.
+ */
+ if (sds->busiest)
+ sds->prefer_sibling = !!(sds->busiest->flags & SD_PREFER_SIBLING);
if (env->sd->flags & SD_NUMA)
@@ -10440,7 +10493,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
goto out_balanced;
}
- /* Try to move all excess tasks to child's sibling domain */
+ /*
+ * Try to move all excess tasks to a sibling domain of the busiest
+ * group's child domain.
+ */
if (sds.prefer_sibling && local->group_type == group_has_spare &&
busiest->sum_nr_running > local->sum_nr_running + 1)
goto force_balance;
@@ -10542,8 +10598,15 @@ static struct rq *find_busiest_queue(struct lb_env *env,
nr_running == 1)
continue;
- /* Make sure we only pull tasks from a CPU of lower priority */
+ /*
+ * Make sure we only pull tasks from a CPU of lower priority
+ * when balancing between SMT siblings.
+ *
+ * If balancing between cores, let lower priority CPUs help
+ * SMT cores with more than one busy sibling.
+ */
if ((env->sd->flags & SD_ASYM_PACKING) &&
+ sched_use_asym_prio(env->sd, i) &&
sched_asym_prefer(i, env->dst_cpu) &&
nr_running == 1)
continue;
@@ -10581,7 +10644,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
break;
case migrate_util:
- util = cpu_util_cfs(i);
+ util = cpu_util_cfs_boost(i);
/*
* Don't try to pull utilization from a CPU with one
@@ -10632,12 +10695,19 @@ static inline bool
asym_active_balance(struct lb_env *env)
{
/*
- * ASYM_PACKING needs to force migrate tasks from busy but
- * lower priority CPUs in order to pack all tasks in the
- * highest priority CPUs.
+ * ASYM_PACKING needs to force migrate tasks from busy but lower
+ * priority CPUs in order to pack all tasks in the highest priority
+ * CPUs. When done between cores, do it only if the whole core if the
+ * whole core is idle.
+ *
+ * If @env::src_cpu is an SMT core with busy siblings, let
+ * the lower priority @env::dst_cpu help it. Do not follow
+ * CPU priority.
*/
return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
- sched_asym_prefer(env->dst_cpu, env->src_cpu);
+ sched_use_asym_prio(env->sd, env->dst_cpu) &&
+ (sched_asym_prefer(env->dst_cpu, env->src_cpu) ||
+ !sched_use_asym_prio(env->sd, env->src_cpu));
}
static inline bool
@@ -10744,7 +10814,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
.sd = sd,
.dst_cpu = this_cpu,
.dst_rq = this_rq,
- .dst_grpmask = sched_group_span(sd->groups),
+ .dst_grpmask = group_balance_mask(sd->groups),
.idle = idle,
.loop_break = SCHED_NR_MIGRATE_BREAK,
.cpus = cpus,
@@ -11371,9 +11441,13 @@ static void nohz_balancer_kick(struct rq *rq)
* When ASYM_PACKING; see if there's a more preferred CPU
* currently idle; in which case, kick the ILB to move tasks
* around.
+ *
+ * When balancing betwen cores, all the SMT siblings of the
+ * preferred CPU must be idle.
*/
for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
- if (sched_asym_prefer(i, cpu)) {
+ if (sched_use_asym_prio(sd, i) &&
+ sched_asym_prefer(i, cpu)) {
flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index e072f6b31bf3..9bb3f2b3ccfc 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -160,7 +160,6 @@ __setup("psi=", setup_psi);
#define EXP_300s 2034 /* 1/exp(2s/300s) */
/* PSI trigger definitions */
-#define WINDOW_MIN_US 500000 /* Min window size is 500ms */
#define WINDOW_MAX_US 10000000 /* Max window size is 10s */
#define UPDATES_PER_WINDOW 10 /* 10 updates per window */
@@ -494,8 +493,12 @@ static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total,
continue;
/* Generate an event */
- if (cmpxchg(&t->event, 0, 1) == 0)
- wake_up_interruptible(&t->event_wait);
+ if (cmpxchg(&t->event, 0, 1) == 0) {
+ if (t->of)
+ kernfs_notify(t->of->kn);
+ else
+ wake_up_interruptible(&t->event_wait);
+ }
t->last_event_time = now;
/* Reset threshold breach flag once event got generated */
t->pending_event = false;
@@ -1272,8 +1275,9 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
return 0;
}
-struct psi_trigger *psi_trigger_create(struct psi_group *group,
- char *buf, enum psi_res res, struct file *file)
+struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf,
+ enum psi_res res, struct file *file,
+ struct kernfs_open_file *of)
{
struct psi_trigger *t;
enum psi_states state;
@@ -1305,8 +1309,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
if (state >= PSI_NONIDLE)
return ERR_PTR(-EINVAL);
- if (window_us < WINDOW_MIN_US ||
- window_us > WINDOW_MAX_US)
+ if (window_us == 0 || window_us > WINDOW_MAX_US)
return ERR_PTR(-EINVAL);
/*
@@ -1333,7 +1336,9 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
t->event = 0;
t->last_event_time = 0;
- init_waitqueue_head(&t->event_wait);
+ t->of = of;
+ if (!of)
+ init_waitqueue_head(&t->event_wait);
t->pending_event = false;
t->aggregator = privileged ? PSI_POLL : PSI_AVGS;
@@ -1390,7 +1395,10 @@ void psi_trigger_destroy(struct psi_trigger *t)
* being accessed later. Can happen if cgroup is deleted from under a
* polling process.
*/
- wake_up_pollfree(&t->event_wait);
+ if (t->of)
+ kernfs_notify(t->of->kn);
+ else
+ wake_up_interruptible(&t->event_wait);
if (t->aggregator == PSI_AVGS) {
mutex_lock(&group->avgs_lock);
@@ -1409,11 +1417,16 @@ void psi_trigger_destroy(struct psi_trigger *t)
group->rtpoll_nr_triggers[t->state]--;
if (!group->rtpoll_nr_triggers[t->state])
group->rtpoll_states &= ~(1 << t->state);
- /* reset min update period for the remaining triggers */
- list_for_each_entry(tmp, &group->rtpoll_triggers, node)
- period = min(period, div_u64(tmp->win.size,
- UPDATES_PER_WINDOW));
- group->rtpoll_min_period = period;
+ /*
+ * Reset min update period for the remaining triggers
+ * iff the destroying trigger had the min window size.
+ */
+ if (group->rtpoll_min_period == div_u64(t->win.size, UPDATES_PER_WINDOW)) {
+ list_for_each_entry(tmp, &group->rtpoll_triggers, node)
+ period = min(period, div_u64(tmp->win.size,
+ UPDATES_PER_WINDOW));
+ group->rtpoll_min_period = period;
+ }
/* Destroy rtpoll_task when the last trigger is destroyed */
if (group->rtpoll_states == 0) {
group->rtpoll_until = 0;
@@ -1462,7 +1475,10 @@ __poll_t psi_trigger_poll(void **trigger_ptr,
if (!t)
return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
- poll_wait(file, &t->event_wait, wait);
+ if (t->of)
+ kernfs_generic_poll(t->of, wait);
+ else
+ poll_wait(file, &t->event_wait, wait);
if (cmpxchg(&t->event, 1, 0) == 1)
ret |= EPOLLPRI;
@@ -1532,7 +1548,7 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
return -EBUSY;
}
- new = psi_trigger_create(&psi_system, buf, res, file);
+ new = psi_trigger_create(&psi_system, buf, res, file, NULL);
if (IS_ERR(new)) {
mutex_unlock(&seq->lock);
return PTR_ERR(new);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 00e0e5074115..185d3d749f6b 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -25,7 +25,7 @@ unsigned int sysctl_sched_rt_period = 1000000;
int sysctl_sched_rt_runtime = 950000;
#ifdef CONFIG_SYSCTL
-static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
+static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC * RR_TIMESLICE) / HZ;
static int sched_rt_handler(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos);
static int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ec7b3e0a2b20..e93e006a942b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -286,12 +286,6 @@ struct rt_bandwidth {
void __dl_clear_params(struct task_struct *p);
-struct dl_bandwidth {
- raw_spinlock_t dl_runtime_lock;
- u64 dl_runtime;
- u64 dl_period;
-};
-
static inline int dl_bandwidth_enabled(void)
{
return sysctl_sched_rt_runtime >= 0;
@@ -330,7 +324,7 @@ extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
extern bool __checkparam_dl(const struct sched_attr *attr);
extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
-extern int dl_cpu_busy(int cpu, struct task_struct *p);
+extern int dl_bw_check_overflow(int cpu);
#ifdef CONFIG_CGROUP_SCHED
@@ -754,6 +748,12 @@ struct dl_rq {
u64 extra_bw;
/*
+ * Maximum available bandwidth for reclaiming by SCHED_FLAG_RECLAIM
+ * tasks of this rq. Used in calculation of reclaimable bandwidth(GRUB).
+ */
+ u64 max_bw;
+
+ /*
* Inverse of the fraction of CPU utilization that can be reclaimed
* by the GRUB algorithm.
*/
@@ -1546,6 +1546,28 @@ static inline void rq_clock_cancel_skipupdate(struct rq *rq)
rq->clock_update_flags &= ~RQCF_REQ_SKIP;
}
+/*
+ * During cpu offlining and rq wide unthrottling, we can trigger
+ * an update_rq_clock() for several cfs and rt runqueues (Typically
+ * when using list_for_each_entry_*)
+ * rq_clock_start_loop_update() can be called after updating the clock
+ * once and before iterating over the list to prevent multiple update.
+ * After the iterative traversal, we need to call rq_clock_stop_loop_update()
+ * to clear RQCF_ACT_SKIP of rq->clock_update_flags.
+ */
+static inline void rq_clock_start_loop_update(struct rq *rq)
+{
+ lockdep_assert_rq_held(rq);
+ SCHED_WARN_ON(rq->clock_update_flags & RQCF_ACT_SKIP);
+ rq->clock_update_flags |= RQCF_ACT_SKIP;
+}
+
+static inline void rq_clock_stop_loop_update(struct rq *rq)
+{
+ lockdep_assert_rq_held(rq);
+ rq->clock_update_flags &= ~RQCF_ACT_SKIP;
+}
+
struct rq_flags {
unsigned long flags;
struct pin_cookie cookie;
@@ -1772,6 +1794,13 @@ queue_balance_callback(struct rq *rq,
for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
__sd; __sd = __sd->parent)
+/* A mask of all the SD flags that have the SDF_SHARED_CHILD metaflag */
+#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_SHARED_CHILD)) |
+static const unsigned int SD_SHARED_CHILD_MASK =
+#include <linux/sched/sd_flags.h>
+0;
+#undef SD_FLAG
+
/**
* highest_flag_domain - Return highest sched_domain containing flag.
* @cpu: The CPU whose highest level of sched domain is to
@@ -1779,16 +1808,25 @@ queue_balance_callback(struct rq *rq,
* @flag: The flag to check for the highest sched_domain
* for the given CPU.
*
- * Returns the highest sched_domain of a CPU which contains the given flag.
+ * Returns the highest sched_domain of a CPU which contains @flag. If @flag has
+ * the SDF_SHARED_CHILD metaflag, all the children domains also have @flag.
*/
static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
{
struct sched_domain *sd, *hsd = NULL;
for_each_domain(cpu, sd) {
- if (!(sd->flags & flag))
+ if (sd->flags & flag) {
+ hsd = sd;
+ continue;
+ }
+
+ /*
+ * Stop the search if @flag is known to be shared at lower
+ * levels. It will not be found further up.
+ */
+ if (flag & SD_SHARED_CHILD_MASK)
break;
- hsd = sd;
}
return hsd;
@@ -2378,7 +2416,6 @@ extern struct rt_bandwidth def_rt_bandwidth;
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
-extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
@@ -2946,53 +2983,9 @@ static inline unsigned long cpu_util_dl(struct rq *rq)
return READ_ONCE(rq->avg_dl.util_avg);
}
-/**
- * cpu_util_cfs() - Estimates the amount of CPU capacity used by CFS tasks.
- * @cpu: the CPU to get the utilization for.
- *
- * The unit of the return value must be the same as the one of CPU capacity
- * so that CPU utilization can be compared with CPU capacity.
- *
- * CPU utilization is the sum of running time of runnable tasks plus the
- * recent utilization of currently non-runnable tasks on that CPU.
- * It represents the amount of CPU capacity currently used by CFS tasks in
- * the range [0..max CPU capacity] with max CPU capacity being the CPU
- * capacity at f_max.
- *
- * The estimated CPU utilization is defined as the maximum between CPU
- * utilization and sum of the estimated utilization of the currently
- * runnable tasks on that CPU. It preserves a utilization "snapshot" of
- * previously-executed tasks, which helps better deduce how busy a CPU will
- * be when a long-sleeping task wakes up. The contribution to CPU utilization
- * of such a task would be significantly decayed at this point of time.
- *
- * CPU utilization can be higher than the current CPU capacity
- * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because
- * of rounding errors as well as task migrations or wakeups of new tasks.
- * CPU utilization has to be capped to fit into the [0..max CPU capacity]
- * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%)
- * could be seen as over-utilized even though CPU1 has 20% of spare CPU
- * capacity. CPU utilization is allowed to overshoot current CPU capacity
- * though since this is useful for predicting the CPU capacity required
- * after task migrations (scheduler-driven DVFS).
- *
- * Return: (Estimated) utilization for the specified CPU.
- */
-static inline unsigned long cpu_util_cfs(int cpu)
-{
- struct cfs_rq *cfs_rq;
- unsigned long util;
-
- cfs_rq = &cpu_rq(cpu)->cfs;
- util = READ_ONCE(cfs_rq->avg.util_avg);
-
- if (sched_feat(UTIL_EST)) {
- util = max_t(unsigned long, util,
- READ_ONCE(cfs_rq->avg.util_est.enqueued));
- }
- return min(util, capacity_orig_of(cpu));
-}
+extern unsigned long cpu_util_cfs(int cpu);
+extern unsigned long cpu_util_cfs_boost(int cpu);
static inline unsigned long cpu_util_rt(struct rq *rq)
{
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 6682535e37c8..d3a3b2646ec4 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -487,9 +487,9 @@ static void free_rootdomain(struct rcu_head *rcu)
void rq_attach_root(struct rq *rq, struct root_domain *rd)
{
struct root_domain *old_rd = NULL;
- unsigned long flags;
+ struct rq_flags rf;
- raw_spin_rq_lock_irqsave(rq, flags);
+ rq_lock_irqsave(rq, &rf);
if (rq->rd) {
old_rd = rq->rd;
@@ -515,7 +515,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
set_rq_online(rq);
- raw_spin_rq_unlock_irqrestore(rq, flags);
+ rq_unlock_irqrestore(rq, &rf);
if (old_rd)
call_rcu(&old_rd->rcu, free_rootdomain);
@@ -719,8 +719,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
if (sd_parent_degenerate(tmp, parent)) {
tmp->parent = parent->parent;
- if (parent->parent)
+
+ if (parent->parent) {
parent->parent->child = tmp;
+ if (tmp->flags & SD_SHARE_CPUCAPACITY)
+ parent->parent->groups->flags |= SD_SHARE_CPUCAPACITY;
+ }
+
/*
* Transfer SD_PREFER_SIBLING down in case of a
* degenerate parent; the spans match for this
@@ -1676,7 +1681,7 @@ static struct sched_domain_topology_level *sched_domain_topology_saved;
#define for_each_sd_topology(tl) \
for (tl = sched_domain_topology; tl->mask; tl++)
-void set_sched_topology(struct sched_domain_topology_level *tl)
+void __init set_sched_topology(struct sched_domain_topology_level *tl)
{
if (WARN_ON_ONCE(sched_smp_initialized))
return;
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 133b74730738..48c53e4739ea 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -425,11 +425,6 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i
}
EXPORT_SYMBOL(autoremove_wake_function);
-static inline bool is_kthread_should_stop(void)
-{
- return (current->flags & PF_KTHREAD) && kthread_should_stop();
-}
-
/*
* DEFINE_WAIT_FUNC(wait, woken_wake_func);
*
@@ -459,7 +454,7 @@ long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout)
* or woken_wake_function() sees our store to current->state.
*/
set_current_state(mode); /* A */
- if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop())
+ if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !kthread_should_stop_or_park())
timeout = schedule_timeout(timeout);
__set_current_state(TASK_RUNNING);