diff options
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r-- | kernel/sched/fair.c | 385 |
1 files changed, 237 insertions, 148 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dea138964b91..76f67b3e34d6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2767,7 +2767,7 @@ static const u32 __accumulated_sum_N32[] = { * Approximate: * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) */ -static __always_inline u64 decay_load(u64 val, u64 n) +static u64 decay_load(u64 val, u64 n) { unsigned int local_n; @@ -2795,32 +2795,113 @@ static __always_inline u64 decay_load(u64 val, u64 n) return val; } -/* - * For updates fully spanning n periods, the contribution to runnable - * average will be: \Sum 1024*y^n - * - * We can compute this reasonably efficiently by combining: - * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD} - */ -static u32 __compute_runnable_contrib(u64 n) +static u32 __accumulate_sum(u64 periods, u32 period_contrib, u32 remainder) { - u32 contrib = 0; + u32 c1, c2, c3 = remainder; /* y^0 == 1 */ + + if (!periods) + return remainder - period_contrib; - if (likely(n <= LOAD_AVG_PERIOD)) - return runnable_avg_yN_sum[n]; - else if (unlikely(n >= LOAD_AVG_MAX_N)) + if (unlikely(periods >= LOAD_AVG_MAX_N)) return LOAD_AVG_MAX; - /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */ - contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD]; - n %= LOAD_AVG_PERIOD; - contrib = decay_load(contrib, n); - return contrib + runnable_avg_yN_sum[n]; + /* + * c1 = d1 y^(p+1) + */ + c1 = decay_load((u64)(1024 - period_contrib), periods); + + periods -= 1; + /* + * For updates fully spanning n periods, the contribution to runnable + * average will be: + * + * c2 = 1024 \Sum y^n + * + * We can compute this reasonably efficiently by combining: + * + * y^PERIOD = 1/2 with precomputed 1024 \Sum y^n {for: n < PERIOD} + */ + if (likely(periods <= LOAD_AVG_PERIOD)) { + c2 = runnable_avg_yN_sum[periods]; + } else { + c2 = __accumulated_sum_N32[periods/LOAD_AVG_PERIOD]; + periods %= LOAD_AVG_PERIOD; + c2 = decay_load(c2, periods); + c2 += runnable_avg_yN_sum[periods]; + } + + return c1 + c2 + c3; } #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) /* + * Accumulate the three separate parts of the sum; d1 the remainder + * of the last (incomplete) period, d2 the span of full periods and d3 + * the remainder of the (incomplete) current period. + * + * d1 d2 d3 + * ^ ^ ^ + * | | | + * |<->|<----------------->|<--->| + * ... |---x---|------| ... |------|-----x (now) + * + * p + * u' = (u + d1) y^(p+1) + 1024 \Sum y^n + d3 y^0 + * n=1 + * + * = u y^(p+1) + (Step 1) + * + * p + * d1 y^(p+1) + 1024 \Sum y^n + d3 y^0 (Step 2) + * n=1 + */ +static __always_inline u32 +accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, + unsigned long weight, int running, struct cfs_rq *cfs_rq) +{ + unsigned long scale_freq, scale_cpu; + u64 periods; + u32 contrib; + + scale_freq = arch_scale_freq_capacity(NULL, cpu); + scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + + delta += sa->period_contrib; + periods = delta / 1024; /* A period is 1024us (~1ms) */ + + /* + * Step 1: decay old *_sum if we crossed period boundaries. + */ + if (periods) { + sa->load_sum = decay_load(sa->load_sum, periods); + if (cfs_rq) { + cfs_rq->runnable_load_sum = + decay_load(cfs_rq->runnable_load_sum, periods); + } + sa->util_sum = decay_load((u64)(sa->util_sum), periods); + } + + /* + * Step 2 + */ + delta %= 1024; + contrib = __accumulate_sum(periods, sa->period_contrib, delta); + sa->period_contrib = delta; + + contrib = cap_scale(contrib, scale_freq); + if (weight) { + sa->load_sum += weight * contrib; + if (cfs_rq) + cfs_rq->runnable_load_sum += weight * contrib; + } + if (running) + sa->util_sum += contrib * scale_cpu; + + return periods; +} + +/* * We can represent the historical contribution to runnable average as the * coefficients of a geometric series. To do this we sub-divide our runnable * history into segments of approximately 1ms (1024us); label the segment that @@ -2849,13 +2930,10 @@ static u32 __compute_runnable_contrib(u64 n) * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] */ static __always_inline int -__update_load_avg(u64 now, int cpu, struct sched_avg *sa, +___update_load_avg(u64 now, int cpu, struct sched_avg *sa, unsigned long weight, int running, struct cfs_rq *cfs_rq) { - u64 delta, scaled_delta, periods; - u32 contrib; - unsigned int delta_w, scaled_delta_w, decayed = 0; - unsigned long scale_freq, scale_cpu; + u64 delta; delta = now - sa->last_update_time; /* @@ -2876,81 +2954,49 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, return 0; sa->last_update_time = now; - scale_freq = arch_scale_freq_capacity(NULL, cpu); - scale_cpu = arch_scale_cpu_capacity(NULL, cpu); - - /* delta_w is the amount already accumulated against our next period */ - delta_w = sa->period_contrib; - if (delta + delta_w >= 1024) { - decayed = 1; - - /* how much left for next period will start over, we don't know yet */ - sa->period_contrib = 0; - - /* - * Now that we know we're crossing a period boundary, figure - * out how much from delta we need to complete the current - * period and accrue it. - */ - delta_w = 1024 - delta_w; - scaled_delta_w = cap_scale(delta_w, scale_freq); - if (weight) { - sa->load_sum += weight * scaled_delta_w; - if (cfs_rq) { - cfs_rq->runnable_load_sum += - weight * scaled_delta_w; - } - } - if (running) - sa->util_sum += scaled_delta_w * scale_cpu; - - delta -= delta_w; - - /* Figure out how many additional periods this update spans */ - periods = delta / 1024; - delta %= 1024; + /* + * Now we know we crossed measurement unit boundaries. The *_avg + * accrues by two steps: + * + * Step 1: accumulate *_sum since last_update_time. If we haven't + * crossed period boundaries, finish. + */ + if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq)) + return 0; - sa->load_sum = decay_load(sa->load_sum, periods + 1); - if (cfs_rq) { - cfs_rq->runnable_load_sum = - decay_load(cfs_rq->runnable_load_sum, periods + 1); - } - sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1); - - /* Efficiently calculate \sum (1..n_period) 1024*y^i */ - contrib = __compute_runnable_contrib(periods); - contrib = cap_scale(contrib, scale_freq); - if (weight) { - sa->load_sum += weight * contrib; - if (cfs_rq) - cfs_rq->runnable_load_sum += weight * contrib; - } - if (running) - sa->util_sum += contrib * scale_cpu; + /* + * Step 2: update *_avg. + */ + sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); + if (cfs_rq) { + cfs_rq->runnable_load_avg = + div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); } + sa->util_avg = sa->util_sum / LOAD_AVG_MAX; - /* Remainder of delta accrued against u_0` */ - scaled_delta = cap_scale(delta, scale_freq); - if (weight) { - sa->load_sum += weight * scaled_delta; - if (cfs_rq) - cfs_rq->runnable_load_sum += weight * scaled_delta; - } - if (running) - sa->util_sum += scaled_delta * scale_cpu; + return 1; +} - sa->period_contrib += delta; +static int +__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) +{ + return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL); +} - if (decayed) { - sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); - if (cfs_rq) { - cfs_rq->runnable_load_avg = - div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); - } - sa->util_avg = sa->util_sum / LOAD_AVG_MAX; - } +static int +__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + return ___update_load_avg(now, cpu, &se->avg, + se->on_rq * scale_load_down(se->load.weight), + cfs_rq->curr == se, NULL); +} - return decayed; +static int +__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) +{ + return ___update_load_avg(now, cpu, &cfs_rq->avg, + scale_load_down(cfs_rq->load.weight), + cfs_rq->curr != NULL, cfs_rq); } /* @@ -3014,6 +3060,9 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next) { + u64 p_last_update_time; + u64 n_last_update_time; + if (!sched_feat(ATTACH_AGE_LOAD)) return; @@ -3024,11 +3073,11 @@ void set_task_rq_fair(struct sched_entity *se, * time. This will result in the wakee task is less decayed, but giving * the wakee more load sounds not bad. */ - if (se->avg.last_update_time && prev) { - u64 p_last_update_time; - u64 n_last_update_time; + if (!(se->avg.last_update_time && prev)) + return; #ifndef CONFIG_64BIT + { u64 p_last_update_time_copy; u64 n_last_update_time_copy; @@ -3043,14 +3092,13 @@ void set_task_rq_fair(struct sched_entity *se, } while (p_last_update_time != p_last_update_time_copy || n_last_update_time != n_last_update_time_copy); + } #else - p_last_update_time = prev->avg.last_update_time; - n_last_update_time = next->avg.last_update_time; + p_last_update_time = prev->avg.last_update_time; + n_last_update_time = next->avg.last_update_time; #endif - __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)), - &se->avg, 0, 0, NULL); - se->avg.last_update_time = n_last_update_time; - } + __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se); + se->avg.last_update_time = n_last_update_time; } /* Take into account change of utilization of a child task group */ @@ -3173,6 +3221,36 @@ static inline int propagate_entity_load_avg(struct sched_entity *se) return 1; } +/* + * Check if we need to update the load and the utilization of a blocked + * group_entity: + */ +static inline bool skip_blocked_update(struct sched_entity *se) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + + /* + * If sched_entity still have not zero load or utilization, we have to + * decay it: + */ + if (se->avg.load_avg || se->avg.util_avg) + return false; + + /* + * If there is a pending propagation, we have to update the load and + * the utilization of the sched_entity: + */ + if (gcfs_rq->propagate_avg) + return false; + + /* + * Otherwise, the load and the utilization of the sched_entity is + * already zero and there is no pending propagation, so it will be a + * waste of time to try to decay it: + */ + return true; +} + #else /* CONFIG_FAIR_GROUP_SCHED */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} @@ -3265,8 +3343,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) set_tg_cfs_propagate(cfs_rq); } - decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, - scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq); + decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq); #ifndef CONFIG_64BIT smp_wmb(); @@ -3298,11 +3375,8 @@ static inline void update_load_avg(struct sched_entity *se, int flags) * Track task load average for carrying it to new CPU after migrated, and * track group sched_entity load average for task_h_load calc in migration */ - if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) { - __update_load_avg(now, cpu, &se->avg, - se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL); - } + if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) + __update_load_avg_se(now, cpu, cfs_rq, se); decayed = update_cfs_rq_load_avg(now, cfs_rq, true); decayed |= propagate_entity_load_avg(se); @@ -3407,7 +3481,7 @@ void sync_entity_load_avg(struct sched_entity *se) u64 last_update_time; last_update_time = cfs_rq_last_update_time(cfs_rq); - __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); + __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se); } /* @@ -4271,8 +4345,9 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, throttled_list) { struct rq *rq = rq_of(cfs_rq); + struct rq_flags rf; - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); if (!cfs_rq_throttled(cfs_rq)) goto next; @@ -4289,7 +4364,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, unthrottle_cfs_rq(cfs_rq); next: - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); if (!remaining) break; @@ -5097,15 +5172,16 @@ void cpu_load_update_nohz_stop(void) unsigned long curr_jiffies = READ_ONCE(jiffies); struct rq *this_rq = this_rq(); unsigned long load; + struct rq_flags rf; if (curr_jiffies == this_rq->last_load_update_tick) return; load = weighted_cpuload(cpu_of(this_rq)); - raw_spin_lock(&this_rq->lock); + rq_lock(this_rq, &rf); update_rq_clock(this_rq); cpu_load_update_nohz(this_rq, curr_jiffies, load); - raw_spin_unlock(&this_rq->lock); + rq_unlock(this_rq, &rf); } #else /* !CONFIG_NO_HZ_COMMON */ static inline void cpu_load_update_nohz(struct rq *this_rq, @@ -6769,7 +6845,7 @@ static void detach_task(struct task_struct *p, struct lb_env *env) lockdep_assert_held(&env->src_rq->lock); p->on_rq = TASK_ON_RQ_MIGRATING; - deactivate_task(env->src_rq, p, 0); + deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, env->dst_cpu); } @@ -6902,7 +6978,7 @@ static void attach_task(struct rq *rq, struct task_struct *p) lockdep_assert_held(&rq->lock); BUG_ON(task_rq(p) != rq); - activate_task(rq, p, 0); + activate_task(rq, p, ENQUEUE_NOCLOCK); p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(rq, p, 0); } @@ -6913,9 +6989,12 @@ static void attach_task(struct rq *rq, struct task_struct *p) */ static void attach_one_task(struct rq *rq, struct task_struct *p) { - raw_spin_lock(&rq->lock); + struct rq_flags rf; + + rq_lock(rq, &rf); + update_rq_clock(rq); attach_task(rq, p); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); } /* @@ -6926,8 +7005,10 @@ static void attach_tasks(struct lb_env *env) { struct list_head *tasks = &env->tasks; struct task_struct *p; + struct rq_flags rf; - raw_spin_lock(&env->dst_rq->lock); + rq_lock(env->dst_rq, &rf); + update_rq_clock(env->dst_rq); while (!list_empty(tasks)) { p = list_first_entry(tasks, struct task_struct, se.group_node); @@ -6936,7 +7017,7 @@ static void attach_tasks(struct lb_env *env) attach_task(env->dst_rq, p); } - raw_spin_unlock(&env->dst_rq->lock); + rq_unlock(env->dst_rq, &rf); } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -6944,9 +7025,9 @@ static void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq; - unsigned long flags; + struct rq_flags rf; - raw_spin_lock_irqsave(&rq->lock, flags); + rq_lock_irqsave(rq, &rf); update_rq_clock(rq); /* @@ -6954,6 +7035,8 @@ static void update_blocked_averages(int cpu) * list_add_leaf_cfs_rq() for details. */ for_each_leaf_cfs_rq(rq, cfs_rq) { + struct sched_entity *se; + /* throttled entities do not contribute to load */ if (throttled_hierarchy(cfs_rq)) continue; @@ -6961,11 +7044,12 @@ static void update_blocked_averages(int cpu) if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) update_tg_load_avg(cfs_rq, 0); - /* Propagate pending load changes to the parent */ - if (cfs_rq->tg->se[cpu]) - update_load_avg(cfs_rq->tg->se[cpu], 0); + /* Propagate pending load changes to the parent, if any: */ + se = cfs_rq->tg->se[cpu]; + if (se && !skip_blocked_update(se)) + update_load_avg(se, 0); } - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); } /* @@ -7019,12 +7103,12 @@ static inline void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq = &rq->cfs; - unsigned long flags; + struct rq_flags rf; - raw_spin_lock_irqsave(&rq->lock, flags); + rq_lock_irqsave(rq, &rf); update_rq_clock(rq); update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true); - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); } static unsigned long task_h_load(struct task_struct *p) @@ -7525,6 +7609,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd { struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; + struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; int load_idx, prefer_sibling = 0; bool overload = false; @@ -7541,7 +7626,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); if (local_group) { sds->local = sg; - sgs = &sds->local_stat; + sgs = local; if (env->idle != CPU_NEWLY_IDLE || time_after_eq(jiffies, sg->sgc->next_update)) @@ -7565,8 +7650,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd * the tasks on the system). */ if (prefer_sibling && sds->local && - group_has_capacity(env, &sds->local_stat) && - (sgs->sum_nr_running > 1)) { + group_has_capacity(env, local) && + (sgs->sum_nr_running > local->sum_nr_running + 1)) { sgs->group_no_capacity = 1; sgs->group_type = group_classify(sg, sgs); } @@ -8042,7 +8127,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd_parent = sd->parent; struct sched_group *group; struct rq *busiest; - unsigned long flags; + struct rq_flags rf; struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); struct lb_env env = { @@ -8105,7 +8190,7 @@ redo: env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); more_balance: - raw_spin_lock_irqsave(&busiest->lock, flags); + rq_lock_irqsave(busiest, &rf); update_rq_clock(busiest); /* @@ -8122,14 +8207,14 @@ more_balance: * See task_rq_lock() family for the details. */ - raw_spin_unlock(&busiest->lock); + rq_unlock(busiest, &rf); if (cur_ld_moved) { attach_tasks(&env); ld_moved += cur_ld_moved; } - local_irq_restore(flags); + local_irq_restore(rf.flags); if (env.flags & LBF_NEED_BREAK) { env.flags &= ~LBF_NEED_BREAK; @@ -8207,6 +8292,8 @@ more_balance: sd->nr_balance_failed++; if (need_active_balance(&env)) { + unsigned long flags; + raw_spin_lock_irqsave(&busiest->lock, flags); /* don't kick the active_load_balance_cpu_stop, @@ -8444,8 +8531,9 @@ static int active_load_balance_cpu_stop(void *data) struct rq *target_rq = cpu_rq(target_cpu); struct sched_domain *sd; struct task_struct *p = NULL; + struct rq_flags rf; - raw_spin_lock_irq(&busiest_rq->lock); + rq_lock_irq(busiest_rq, &rf); /* make sure the requested cpu hasn't gone down in the meantime */ if (unlikely(busiest_cpu != smp_processor_id() || @@ -8496,7 +8584,7 @@ static int active_load_balance_cpu_stop(void *data) rcu_read_unlock(); out_unlock: busiest_rq->active_balance = 0; - raw_spin_unlock(&busiest_rq->lock); + rq_unlock(busiest_rq, &rf); if (p) attach_one_task(target_rq, p); @@ -8794,10 +8882,13 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) * do the balance. */ if (time_after_eq(jiffies, rq->next_balance)) { - raw_spin_lock_irq(&rq->lock); + struct rq_flags rf; + + rq_lock_irq(rq, &rf); update_rq_clock(rq); cpu_load_update_idle(rq); - raw_spin_unlock_irq(&rq->lock); + rq_unlock_irq(rq, &rf); + rebalance_domains(rq, CPU_IDLE); } @@ -8988,8 +9079,9 @@ static void task_fork_fair(struct task_struct *p) struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se, *curr; struct rq *rq = this_rq(); + struct rq_flags rf; - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); update_rq_clock(rq); cfs_rq = task_cfs_rq(current); @@ -9010,7 +9102,7 @@ static void task_fork_fair(struct task_struct *p) } se->vruntime -= cfs_rq->min_vruntime; - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); } /* @@ -9372,7 +9464,6 @@ static DEFINE_MUTEX(shares_mutex); int sched_group_set_shares(struct task_group *tg, unsigned long shares) { int i; - unsigned long flags; /* * We can't change the weight of the root cgroup. @@ -9389,19 +9480,17 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) tg->shares = shares; for_each_possible_cpu(i) { struct rq *rq = cpu_rq(i); - struct sched_entity *se; + struct sched_entity *se = tg->se[i]; + struct rq_flags rf; - se = tg->se[i]; /* Propagate contribution to hierarchy */ - raw_spin_lock_irqsave(&rq->lock, flags); - - /* Possible calls to update_curr() need rq clock */ + rq_lock_irqsave(rq, &rf); update_rq_clock(rq); for_each_sched_entity(se) { update_load_avg(se, UPDATE_TG); update_cfs_shares(se); } - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); } done: |