Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--  kernel/sched/core.c  141
1 file changed, 83 insertions, 58 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a708d225c28e..9116bcc90346 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -57,6 +57,7 @@
#include <linux/profile.h>
#include <linux/psi.h>
#include <linux/rcuwait_api.h>
+#include <linux/rseq.h>
#include <linux/sched/wake_q.h>
#include <linux/scs.h>
#include <linux/slab.h>
@@ -1131,6 +1132,28 @@ static void wake_up_idle_cpu(int cpu)
if (cpu == smp_processor_id())
return;
+ /*
+ * Set TIF_NEED_RESCHED and send an IPI if in the non-polling
+ * part of the idle loop. This forces an exit from the idle loop
+ * and a round trip to schedule(). Now this could be optimized
+ * because a simple new idle loop iteration is enough to
+ * re-evaluate the next tick. Provided some re-ordering of tick
+ * nohz functions that would need to follow TIF_NR_POLLING
+ * clearing:
+ *
+ * - On most archs, a simple fetch_or on ti::flags with a
+ * "0" value would be enough to know if an IPI needs to be sent.
+ *
+ * - x86 needs to perform a last need_resched() check between
+ * monitor and mwait which doesn't take timers into account.
+ * There a dedicated TIF_TIMER flag would be required to
+ * fetch_or here and be checked along with TIF_NEED_RESCHED
+ * before mwait().
+ *
+ * However, remote timer enqueue is not such a frequent event,
+ * and testing of the above solutions didn't show much benefit.
+ */
if (set_nr_and_not_polling(rq->idle))
smp_send_reschedule(cpu);
else
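The comment above hinges on the polling handshake between the waker and the idle loop: set TIF_NEED_RESCHED with an atomic fetch_or and only send the IPI when the idle task was not already polling its flags, which is what set_nr_and_not_polling() returning true indicates. A minimal user-space model of that pattern, using C11 atomics; the flag values, kick_cpu() and wake_idle() are illustrative stand-ins, not kernel code:

#include <stdatomic.h>
#include <stdio.h>

#define TIF_NEED_RESCHED	(1u << 0)
#define TIF_POLLING_NRFLAG	(1u << 1)	/* idle task polls its flags instead of waiting for an IPI */

static atomic_uint idle_flags;

static void kick_cpu(void)			/* stand-in for smp_send_reschedule() */
{
	puts("IPI sent");
}

/* Set NEED_RESCHED; kick only when the idle task is not polling. */
static void wake_idle(void)
{
	unsigned int old = atomic_fetch_or(&idle_flags, TIF_NEED_RESCHED);

	if (!(old & TIF_POLLING_NRFLAG))
		kick_cpu();
	/* else: the polling idle loop notices TIF_NEED_RESCHED on its own */
}

int main(void)
{
	atomic_store(&idle_flags, TIF_POLLING_NRFLAG);
	wake_idle();	/* no IPI: idle task is polling */
	atomic_store(&idle_flags, 0);
	wake_idle();	/* IPI: idle task is in the non-polling path */
	return 0;
}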
@@ -2124,12 +2147,14 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags)
enqueue_task(rq, p, flags);
- p->on_rq = TASK_ON_RQ_QUEUED;
+ WRITE_ONCE(p->on_rq, TASK_ON_RQ_QUEUED);
+ ASSERT_EXCLUSIVE_WRITER(p->on_rq);
}
void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
- p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
+ WRITE_ONCE(p->on_rq, (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING);
+ ASSERT_EXCLUSIVE_WRITER(p->on_rq);
dequeue_task(rq, p, flags);
}
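The switch to WRITE_ONCE() plus ASSERT_EXCLUSIVE_WRITER() marks p->on_rq as a field that is written only under the runqueue lock but may be observed locklessly; KCSAN will flag any concurrent writer while tolerating marked readers. A sketch of the reader side this pairs with, along the lines of the existing task_on_rq_queued()/task_on_rq_migrating() helpers in kernel/sched/sched.h, shown here with READ_ONCE() for illustration:

/*
 * Lockless readers use READ_ONCE() so the load pairs with the WRITE_ONCE()
 * above and KCSAN treats the data race as intentional.
 */
static inline int task_on_rq_queued(struct task_struct *p)
{
	return READ_ONCE(p->on_rq) == TASK_ON_RQ_QUEUED;
}

static inline int task_on_rq_migrating(struct task_struct *p)
{
	return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING;
}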
@@ -3795,6 +3820,8 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
rq->idle_stamp = 0;
}
#endif
+
+ p->dl_server = NULL;
}
/*
@@ -4509,10 +4536,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
memset(&p->stats, 0, sizeof(p->stats));
#endif
- RB_CLEAR_NODE(&p->dl.rb_node);
- init_dl_task_timer(&p->dl);
- init_dl_inactive_task_timer(&p->dl);
- __dl_clear_params(p);
+ init_dl_entity(&p->dl);
INIT_LIST_HEAD(&p->rt.run_list);
p->rt.timeout = 0;
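Judging by the four removed calls, init_dl_entity() presumably folds the former per-field setup of the deadline entity into a single helper that operates on the sched_dl_entity rather than the task. A sketch of what such a helper would look like, assuming the timer/param helpers were converted to take the entity:

/* Sketch only: consolidates the setup that __sched_fork() used to open-code. */
void init_dl_entity(struct sched_dl_entity *dl_se)
{
	RB_CLEAR_NODE(&dl_se->rb_node);
	init_dl_task_timer(dl_se);
	init_dl_inactive_task_timer(dl_se);
	__dl_clear_params(dl_se);	/* assumes the helper now takes the entity, not the task */
}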
@@ -6004,12 +6028,27 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
p = pick_next_task_idle(rq);
}
+ /*
+ * This is the fast path; it cannot be a DL server pick;
+ * therefore even if @p == @prev, ->dl_server must be NULL.
+ */
+ if (p->dl_server)
+ p->dl_server = NULL;
+
return p;
}
restart:
put_prev_task_balance(rq, prev, rf);
+ /*
+ * We've updated @prev and no longer need the server link, clear it.
+ * Must be done before ->pick_next_task() because that can (re)set
+ * ->dl_server.
+ */
+ if (prev->dl_server)
+ prev->dl_server = NULL;
+
for_each_class(class) {
p = class->pick_next_task(rq);
if (p)
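The two comments added to __pick_next_task() describe an invariant rather than a mechanism: p->dl_server links a task to the deadline server entity it was picked through, it is only meaningful for the pick that just happened, and every path that cannot be a server pick must leave it NULL. A hypothetical debug helper (not part of this patch; it only assumes the existing SCHED_WARN_ON() macro) spelling out the cleared-link half of that invariant:

/*
 * Hypothetical: fast-path picks and @prev after put_prev_task_balance()
 * must never carry a stale deadline-server link.
 */
static inline void assert_no_dl_server_link(struct task_struct *p)
{
	SCHED_WARN_ON(p->dl_server);
}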
@@ -7429,18 +7468,13 @@ int sched_core_idle_cpu(int cpu)
* required to meet deadlines.
*/
unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
- enum cpu_util_type type,
- struct task_struct *p)
+ unsigned long *min,
+ unsigned long *max)
{
- unsigned long dl_util, util, irq, max;
+ unsigned long util, irq, scale;
struct rq *rq = cpu_rq(cpu);
- max = arch_scale_cpu_capacity(cpu);
-
- if (!uclamp_is_used() &&
- type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
- return max;
- }
+ scale = arch_scale_cpu_capacity(cpu);
/*
* Early check to see if IRQ/steal time saturates the CPU, can be
@@ -7448,45 +7482,49 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
* update_irq_load_avg().
*/
irq = cpu_util_irq(rq);
- if (unlikely(irq >= max))
- return max;
+ if (unlikely(irq >= scale)) {
+ if (min)
+ *min = scale;
+ if (max)
+ *max = scale;
+ return scale;
+ }
+
+ if (min) {
+ /*
+ * The minimum utilization is the highest level between:
+ * - the computed DL bandwidth plus the IRQ pressure, which
+ *   steals time from the deadline tasks.
+ * - the minimum performance requirement for CFS and/or RT.
+ */
+ *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN));
+
+ /*
+ * When an RT task is runnable and uclamp is not used, we must
+ * ensure that the task will run at maximum compute capacity.
+ */
+ if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt))
+ *min = max(*min, scale);
+ }
/*
* Because the time spent on RT/DL tasks is visible as 'lost' time to
* CFS tasks and we use the same metric to track the effective
* utilization (PELT windows are synchronized) we can directly add them
* to obtain the CPU's actual utilization.
- *
- * CFS and RT utilization can be boosted or capped, depending on
- * utilization clamp constraints requested by currently RUNNABLE
- * tasks.
- * When there are no CFS RUNNABLE tasks, clamps are released and
- * frequency will be gracefully reduced with the utilization decay.
*/
util = util_cfs + cpu_util_rt(rq);
- if (type == FREQUENCY_UTIL)
- util = uclamp_rq_util_with(rq, util, p);
-
- dl_util = cpu_util_dl(rq);
+ util += cpu_util_dl(rq);
/*
- * For frequency selection we do not make cpu_util_dl() a permanent part
- * of this sum because we want to use cpu_bw_dl() later on, but we need
- * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
- * that we select f_max when there is no idle time.
- *
- * NOTE: numerical errors or stop class might cause us to not quite hit
- * saturation when we should -- something for later.
+ * The maximum hint is a soft bandwidth requirement, which can be lower
+ * than the actual utilization because of uclamp_max requirements.
*/
- if (util + dl_util >= max)
- return max;
+ if (max)
+ *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX));
- /*
- * OTOH, for energy computation we need the estimated running time, so
- * include util_dl and ignore dl_bw.
- */
- if (type == ENERGY_UTIL)
- util += dl_util;
+ if (util >= scale)
+ return scale;
/*
* There is still idle time; further improve the number by using the
@@ -7497,28 +7535,15 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
* U' = irq + --------- * U
* max
*/
- util = scale_irq_capacity(util, irq, max);
+ util = scale_irq_capacity(util, irq, scale);
util += irq;
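	/*
	 * Spelled out, scale_irq_capacity() presumably applies U * (max - irq) / max,
	 * so with the irq term added back: U' = irq + ((max - irq) / max) * U,
	 * where max is the CPU scale. Illustrative numbers: scale = 1024,
	 * irq = 128, U = 512 gives U' = 128 + (896 / 1024) * 512 = 576.
	 */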
- /*
- * Bandwidth required by DEADLINE must always be granted while, for
- * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
- * to gracefully reduce the frequency when no tasks show up for longer
- * periods of time.
- *
- * Ideally we would like to set bw_dl as min/guaranteed freq and util +
- * bw_dl as requested freq. However, cpufreq is not yet ready for such
- * an interface. So, we only do the latter for now.
- */
- if (type == FREQUENCY_UTIL)
- util += cpu_bw_dl(rq);
-
- return min(max, util);
+ return min(scale, util);
}
unsigned long sched_cpu_util(int cpu)
{
- return effective_cpu_util(cpu, cpu_util_cfs(cpu), ENERGY_UTIL, NULL);
+ return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL);
}
#endif /* CONFIG_SMP */
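With the reworked signature, effective_cpu_util() reports three things at once: the actual utilization (return value), a hard floor in *min and a soft ceiling in *max. A hypothetical caller-side sketch (not part of this patch) of how a frequency-selection path could combine them; example_cpu_perf_request() and its policy are illustrative only:

static unsigned long example_cpu_perf_request(int cpu)
{
	unsigned long min_util, max_util;
	unsigned long util = effective_cpu_util(cpu, cpu_util_cfs(cpu),
						&min_util, &max_util);

	/* No need to run faster than the actual utilization... */
	if (util < max_util)
		max_util = util;

	/* ...but never drop below the DL-bandwidth / RT / uclamp_min floor. */
	return max(min_util, max_util);
}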