diff options
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r-- | kernel/sched/fair.c | 305 |
1 files changed, 198 insertions, 107 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a53be398f309..ba052a06da61 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2659,7 +2659,7 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr) /* * We don't care about NUMA placement if we don't have memory. */ - if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work) + if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work) return; /* @@ -3488,9 +3488,6 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) cfs_rq->load_last_update_time_copy = sa->last_update_time; #endif - if (decayed) - cfs_rq_util_change(cfs_rq, 0); - return decayed; } @@ -3596,8 +3593,12 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION); update_tg_load_avg(cfs_rq, 0); - } else if (decayed && (flags & UPDATE_TG)) - update_tg_load_avg(cfs_rq, 0); + } else if (decayed) { + cfs_rq_util_change(cfs_rq, 0); + + if (flags & UPDATE_TG) + update_tg_load_avg(cfs_rq, 0); + } } #ifndef CONFIG_64BIT @@ -3805,7 +3806,11 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) return; } - rq->misfit_task_load = task_h_load(p); + /* + * Make sure that misfit_task_load will not be null even if + * task_h_load() returns 0. + */ + rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1); } #else /* CONFIG_SMP */ @@ -3908,6 +3913,7 @@ static inline void check_schedstat_required(void) #endif } +static inline bool cfs_bandwidth_used(void); /* * MIGRATION @@ -3986,10 +3992,16 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) __enqueue_entity(cfs_rq, se); se->on_rq = 1; - if (cfs_rq->nr_running == 1) { + /* + * When bandwidth control is enabled, cfs might have been removed + * because of a parent been throttled but cfs->nr_running > 1. Try to + * add it unconditionnally. + */ + if (cfs_rq->nr_running == 1 || cfs_bandwidth_used()) list_add_leaf_cfs_rq(cfs_rq); + + if (cfs_rq->nr_running == 1) check_enqueue_throttle(cfs_rq); - } } static void __clear_buddies_last(struct sched_entity *se) @@ -4483,7 +4495,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long task_delta, dequeue = 1; + long task_delta, idle_task_delta, dequeue = 1; bool empty; se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; @@ -4494,6 +4506,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) rcu_read_unlock(); task_delta = cfs_rq->h_nr_running; + idle_task_delta = cfs_rq->idle_h_nr_running; for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); /* throttled entity or throttle-on-deactivate */ @@ -4503,6 +4516,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) if (dequeue) dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); qcfs_rq->h_nr_running -= task_delta; + qcfs_rq->idle_h_nr_running -= idle_task_delta; if (qcfs_rq->load.weight) dequeue = 0; @@ -4541,8 +4555,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - int enqueue = 1; - long task_delta; + long task_delta, idle_task_delta; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -4562,24 +4575,58 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) return; task_delta = cfs_rq->h_nr_running; + idle_task_delta = cfs_rq->idle_h_nr_running; for_each_sched_entity(se) { if (se->on_rq) - enqueue = 0; + break; + cfs_rq = cfs_rq_of(se); + enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); + cfs_rq->h_nr_running += task_delta; + cfs_rq->idle_h_nr_running += idle_task_delta; + + /* end evaluation on encountering a throttled cfs_rq */ + if (cfs_rq_throttled(cfs_rq)) + goto unthrottle_throttle; + } + + for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); - if (enqueue) - enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); + cfs_rq->h_nr_running += task_delta; + cfs_rq->idle_h_nr_running += idle_task_delta; + + /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) + goto unthrottle_throttle; + + /* + * One parent has been throttled and cfs_rq removed from the + * list. Add it back to not break the leaf list. + */ + if (throttled_hierarchy(cfs_rq)) + list_add_leaf_cfs_rq(cfs_rq); + } + + /* At this point se is NULL and we are at root level*/ + add_nr_running(rq, task_delta); + +unthrottle_throttle: + /* + * The cfs_rq_throttled() breaks in the above iteration can result in + * incomplete leaf list maintenance, resulting in triggering the + * assertion below. + */ + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + if (list_add_leaf_cfs_rq(cfs_rq)) break; } assert_list_leaf_cfs_rq(rq); - if (!se) - add_nr_running(rq, task_delta); - /* Determine whether we need to wake up potentially idle CPU: */ if (rq->curr == rq->idle && rq->cfs.nr_running) resched_curr(rq); @@ -4904,6 +4951,8 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) if (!overrun) break; + idle = do_sched_cfs_period_timer(cfs_b, overrun, flags); + if (++count > 3) { u64 new, old = ktime_to_ns(cfs_b->period); @@ -4933,8 +4982,6 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) /* reset count so we don't come right back in here */ count = 0; } - - idle = do_sched_cfs_period_timer(cfs_b, overrun, flags); } if (idle) cfs_b->period_active = 0; @@ -5172,6 +5219,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; + int idle_h_nr_running = task_has_idle_policy(p); /* * The code below (indirectly) updates schedutil which looks at @@ -5195,30 +5243,38 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq = cfs_rq_of(se); enqueue_entity(cfs_rq, se, flags); - /* - * end evaluation on encountering a throttled cfs_rq - * - * note: in the case of encountering a throttled cfs_rq we will - * post the final h_nr_running increment below. - */ - if (cfs_rq_throttled(cfs_rq)) - break; cfs_rq->h_nr_running++; + cfs_rq->idle_h_nr_running += idle_h_nr_running; + + /* end evaluation on encountering a throttled cfs_rq */ + if (cfs_rq_throttled(cfs_rq)) + goto enqueue_throttle; flags = ENQUEUE_WAKEUP; } for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); + + update_load_avg(cfs_rq, se, UPDATE_TG); + update_cfs_group(se); + cfs_rq->h_nr_running++; + cfs_rq->idle_h_nr_running += idle_h_nr_running; + /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) - break; + goto enqueue_throttle; - update_load_avg(cfs_rq, se, UPDATE_TG); - update_cfs_group(se); + /* + * One parent has been throttled and cfs_rq removed from the + * list. Add it back to not break the leaf list. + */ + if (throttled_hierarchy(cfs_rq)) + list_add_leaf_cfs_rq(cfs_rq); } +enqueue_throttle: if (!se) { add_nr_running(rq, 1); /* @@ -5272,20 +5328,18 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; int task_sleep = flags & DEQUEUE_SLEEP; + int idle_h_nr_running = task_has_idle_policy(p); for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); - /* - * end evaluation on encountering a throttled cfs_rq - * - * note: in the case of encountering a throttled cfs_rq we will - * post the final h_nr_running decrement below. - */ - if (cfs_rq_throttled(cfs_rq)) - break; cfs_rq->h_nr_running--; + cfs_rq->idle_h_nr_running -= idle_h_nr_running; + + /* end evaluation on encountering a throttled cfs_rq */ + if (cfs_rq_throttled(cfs_rq)) + goto dequeue_throttle; /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { @@ -5304,15 +5358,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); + + update_load_avg(cfs_rq, se, UPDATE_TG); + update_cfs_group(se); + cfs_rq->h_nr_running--; + cfs_rq->idle_h_nr_running -= idle_h_nr_running; + /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) - break; + goto dequeue_throttle; - update_load_avg(cfs_rq, se, UPDATE_TG); - update_cfs_group(se); } +dequeue_throttle: if (!se) sub_nr_running(rq, 1); @@ -6186,6 +6245,7 @@ static inline int select_idle_smt(struct task_struct *p, int target) */ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) { + struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); struct sched_domain *this_sd; u64 avg_cost, avg_idle; u64 time, cost; @@ -6216,11 +6276,11 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t time = local_clock(); - for_each_cpu_wrap(cpu, sched_domain_span(sd), target) { + cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + + for_each_cpu_wrap(cpu, cpus, target) { if (!--nr) return -1; - if (!cpumask_test_cpu(cpu, p->cpus_ptr)) - continue; if (available_idle_cpu(cpu)) break; } @@ -7602,7 +7662,15 @@ static int detach_tasks(struct lb_env *env) if (!can_migrate_task(p, env)) goto next; - load = task_h_load(p); + /* + * Depending of the number of CPUs and tasks and the + * cgroup hierarchy, task_h_load() can return a null + * value. Make sure that env->imbalance decreases + * otherwise detach_tasks() will stop only after + * detaching up to loop_max tasks. + */ + load = max_t(unsigned long, task_h_load(p), 1); + if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) goto next; @@ -7697,6 +7765,7 @@ static void attach_tasks(struct lb_env *env) rq_unlock(env->dst_rq, &rf); } +#ifdef CONFIG_NO_HZ_COMMON static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { if (cfs_rq->avg.load_avg) @@ -7724,6 +7793,41 @@ static inline bool others_have_blocked(struct rq *rq) return false; } +static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) +{ + rq->last_blocked_load_update_tick = jiffies; + + if (!has_blocked) + rq->has_blocked_load = 0; +} +#else +static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; } +static inline bool others_have_blocked(struct rq *rq) { return false; } +static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {} +#endif + +static bool __update_blocked_others(struct rq *rq, bool *done) +{ + const struct sched_class *curr_class; + u64 now = rq_clock_pelt(rq); + bool decayed; + + /* + * update_load_avg() can call cpufreq_update_util(). Make sure that RT, + * DL and IRQ signals have been updated before updating CFS. + */ + curr_class = rq->curr->sched_class; + + decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | + update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | + update_irq_load_avg(rq, 0); + + if (others_have_blocked(rq)) + *done = false; + + return decayed; +} + #ifdef CONFIG_FAIR_GROUP_SCHED static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) @@ -7743,29 +7847,11 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) return true; } -static void update_blocked_averages(int cpu) +static bool __update_blocked_fair(struct rq *rq, bool *done) { - struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq, *pos; - const struct sched_class *curr_class; - struct rq_flags rf; - bool done = true; - - rq_lock_irqsave(rq, &rf); - update_rq_clock(rq); - - /* - * update_cfs_rq_load_avg() can call cpufreq_update_util(). Make sure - * that RT, DL and IRQ signals have been updated before updating CFS. - */ - curr_class = rq->curr->sched_class; - update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); - update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); - update_irq_load_avg(rq, 0); - - /* Don't need periodic decay once load/util_avg are null */ - if (others_have_blocked(rq)) - done = false; + bool decayed = false; + int cpu = cpu_of(rq); /* * Iterates the task_group tree in a bottom up fashion, see @@ -7774,9 +7860,13 @@ static void update_blocked_averages(int cpu) for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) { struct sched_entity *se; - if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) + if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) { update_tg_load_avg(cfs_rq, 0); + if (cfs_rq == &rq->cfs) + decayed = true; + } + /* Propagate pending load changes to the parent, if any: */ se = cfs_rq->tg->se[cpu]; if (se && !skip_blocked_update(se)) @@ -7791,15 +7881,10 @@ static void update_blocked_averages(int cpu) /* Don't need periodic decay once load/util_avg are null */ if (cfs_rq_has_blocked(cfs_rq)) - done = false; + *done = false; } -#ifdef CONFIG_NO_HZ_COMMON - rq->last_blocked_load_update_tick = jiffies; - if (done) - rq->has_blocked_load = 0; -#endif - rq_unlock_irqrestore(rq, &rf); + return decayed; } /* @@ -7849,33 +7934,16 @@ static unsigned long task_h_load(struct task_struct *p) cfs_rq_load_avg(cfs_rq) + 1); } #else -static inline void update_blocked_averages(int cpu) +static bool __update_blocked_fair(struct rq *rq, bool *done) { - struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq = &rq->cfs; - const struct sched_class *curr_class; - struct rq_flags rf; - - rq_lock_irqsave(rq, &rf); - update_rq_clock(rq); + bool decayed; - /* - * update_cfs_rq_load_avg() can call cpufreq_update_util(). Make sure - * that RT, DL and IRQ signals have been updated before updating CFS. - */ - curr_class = rq->curr->sched_class; - update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); - update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); - update_irq_load_avg(rq, 0); + decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq); + if (cfs_rq_has_blocked(cfs_rq)) + *done = false; - update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq); - -#ifdef CONFIG_NO_HZ_COMMON - rq->last_blocked_load_update_tick = jiffies; - if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq)) - rq->has_blocked_load = 0; -#endif - rq_unlock_irqrestore(rq, &rf); + return decayed; } static unsigned long task_h_load(struct task_struct *p) @@ -7884,6 +7952,24 @@ static unsigned long task_h_load(struct task_struct *p) } #endif +static void update_blocked_averages(int cpu) +{ + bool decayed = false, done = true; + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + + rq_lock_irqsave(rq, &rf); + update_rq_clock(rq); + + decayed |= __update_blocked_others(rq, &done); + decayed |= __update_blocked_fair(rq, &done); + + update_blocked_load_status(rq, !done); + if (decayed) + cpufreq_update_util(rq, 0); + rq_unlock_irqrestore(rq, &rf); +} + /********** Helpers for find_busiest_group ************************/ /* @@ -9587,7 +9673,12 @@ static void kick_ilb(unsigned int flags) { int ilb_cpu; - nohz.next_balance++; + /* + * Increase nohz.next_balance only when if full ilb is triggered but + * not if we only update stats. + */ + if (flags & NOHZ_BALANCE_KICK) + nohz.next_balance = jiffies+1; ilb_cpu = find_new_ilb(); @@ -9906,6 +9997,14 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, } } + /* + * next_balance will be updated only when there is a need. + * When the CPU is attached to null domain for ex, it will not be + * updated. + */ + if (likely(update_next_balance)) + nohz.next_balance = next_balance; + /* Newly idle CPU doesn't need an update */ if (idle != CPU_NEWLY_IDLE) { update_blocked_averages(this_cpu); @@ -9926,14 +10025,6 @@ abort: if (has_blocked_load) WRITE_ONCE(nohz.has_blocked, 1); - /* - * next_balance will be updated only when there is a need. - * When the CPU is attached to null domain for ex, it will not be - * updated. - */ - if (likely(update_next_balance)) - nohz.next_balance = next_balance; - return ret; } |