From 5693fa74f98afed5421ac0165e9e9291bde7d9e1 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Fri, 12 Nov 2021 18:52:02 +0000 Subject: kcsan: Use preemption model accessors Per PREEMPT_DYNAMIC, checking CONFIG_PREEMPT doesn't tell you the actual preemption model of the live kernel. Use the newly-introduced accessors instead. Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Marco Elver Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20211112185203.280040-4-valentin.schneider@arm.com --- kernel/kcsan/kcsan_test.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index a36fca063a73..767dfacd6ed3 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -1380,13 +1380,14 @@ static const void *nthreads_gen_params(const void *prev, char *desc) else nthreads *= 2; - if (!IS_ENABLED(CONFIG_PREEMPT) || !IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) { + if (!preempt_model_preemptible() || + !IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) { /* * Without any preemption, keep 2 CPUs free for other tasks, one * of which is the main test case function checking for * completion or failure. */ - const long min_unused_cpus = IS_ENABLED(CONFIG_PREEMPT_NONE) ? 2 : 0; + const long min_unused_cpus = preempt_model_none() ? 2 : 0; const long min_required_cpus = 2 + min_unused_cpus; if (num_online_cpus() < min_required_cpus) { -- cgit From 089c02ae2771a14af2928c59c56abfb9b885a8d7 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Fri, 12 Nov 2021 18:52:03 +0000 Subject: ftrace: Use preemption model accessors for trace header printout Per PREEMPT_DYNAMIC, checking CONFIG_PREEMPT doesn't tell you the actual preemption model of the live kernel. Use the newly-introduced accessors instead. Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20211112185203.280040-5-valentin.schneider@arm.com --- kernel/trace/trace.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f4de111fa18f..124f1897fd56 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4289,17 +4289,11 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) entries, total, buf->cpu, -#if defined(CONFIG_PREEMPT_NONE) - "server", -#elif defined(CONFIG_PREEMPT_VOLUNTARY) - "desktop", -#elif defined(CONFIG_PREEMPT) - "preempt", -#elif defined(CONFIG_PREEMPT_RT) - "preempt_rt", -#else + preempt_model_none() ? "server" : + preempt_model_voluntary() ? "desktop" : + preempt_model_full() ? "preempt" : + preempt_model_rt() ? "preempt_rt" : "unknown", -#endif /* These are reserved for later use */ 0, 0, 0, 0); #ifdef CONFIG_SMP -- cgit From 915a087e4c47334a2f7ba2a4092c4bade0873769 Mon Sep 17 00:00:00 2001 From: Hailong Liu Date: Fri, 1 Apr 2022 13:10:11 +0800 Subject: psi: Fix trigger being fired unexpectedly at initial When a trigger being created, its win.start_value and win.start_time are reset to zero. If group->total[PSI_POLL][t->state] has accumulated before, this trigger will be fired unexpectedly in the next period, even if its growth time does not reach its threshold. So set the window of the new trigger to the current state value. 
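For illustration, a rough sketch of how the polling side evaluates a trigger's window (paraphrased and simplified; the struct and helper here are illustrative, not the exact kernel code). Growth is measured relative to win.start_value, so a start value of zero turns all previously accumulated stall time into apparent "new" growth:

  /* Illustrative sketch of the window growth check. */
  struct psi_window {
          u64 size;        /* tracking window length, in ns */
          u64 start_time;  /* timestamp when the window started */
          u64 start_value; /* state total when the window started */
  };

  static u64 window_growth(struct psi_window *win, u64 total)
  {
          /* stall time accumulated since the window was (re)started */
          return total - win->start_value;
  }

If the trigger is created with window_reset(&t->win, 0, 0, 0) while group->total[PSI_POLL][state] already holds, say, 500ms of stall, the first poll computes growth == 500ms and fires the trigger although nothing new stalled. Seeding start_value with the current state total (and start_time with sched_clock()) makes growth start from zero, which is what this patch does.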
Signed-off-by: Hailong Liu Signed-off-by: Peter Zijlstra (Intel) Acked-by: Suren Baghdasaryan Link: https://lore.kernel.org/r/1648789811-3788971-1-git-send-email-liuhailong@linux.alibaba.com --- kernel/sched/psi.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index a4fa3aadfcba..5a49a8c8783e 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1117,7 +1117,8 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, t->state = state; t->threshold = threshold_us * NSEC_PER_USEC; t->win.size = window_us * NSEC_PER_USEC; - window_reset(&t->win, 0, 0, 0); + window_reset(&t->win, sched_clock(), + group->total[PSI_POLL][t->state], 0); t->event = 0; t->last_event_time = 0; -- cgit From c9ca1762b25310beff332f882e20d1cf39bfaae5 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 6 Apr 2022 15:13:14 +0100 Subject: MAINTAINERS: Add myself as scheduler topology reviewer I've messed around the NUMA/debug bits of the scheduler toplogy in my time at Arm, and even though I've changed ships I still intend to at the very least review those bits. Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220406141315.732473-2-vschneid@redhat.com --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index fd768d43e048..74b739aae6b3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17508,6 +17508,7 @@ R: Steven Rostedt (SCHED_FIFO/SCHED_RR) R: Ben Segall (CONFIG_CFS_BANDWIDTH) R: Mel Gorman (CONFIG_NUMA_BALANCING) R: Daniel Bristot de Oliveira (SCHED_DEADLINE) +R: Valentin Schneider (TOPOLOGY) L: linux-kernel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core -- cgit From e3f73ece75a88b79d14a811cec38a350a694c69d Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 6 Apr 2022 15:13:15 +0100 Subject: mailmap: Update my email address to @redhat.com I've brok^D contributed to a few different files in my time at Arm, so get_maintainer.pl spits out my name from time to time when it's using the git stats. Make that show an email address that's actually in use. Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220406141315.732473-3-vschneid@redhat.com --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index b9d358217586..b76882fcc14a 100644 --- a/.mailmap +++ b/.mailmap @@ -391,6 +391,7 @@ Uwe Kleine-König Uwe Kleine-König Uwe Kleine-König Valdis Kletnieks +Valentin Schneider Vinod Koul Vinod Koul Vinod Koul -- cgit From 06354900787f25bf5be3c07a68e3cdbc5bf0fa69 Mon Sep 17 00:00:00 2001 From: zgpeng Date: Wed, 6 Apr 2022 17:57:05 +0800 Subject: sched/fair: Move calculate of avg_load to a better location In calculate_imbalance function, when the value of local->avg_load is greater than or equal to busiest->avg_load, the calculated sds->avg_load is not used. So this calculation can be placed in a more appropriate position. 
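As a made-up numeric example of why the order matters (SCHED_CAPACITY_SCALE == 1024):

  local->avg_load   = group_load * 1024 / group_capacity = 512 * 1024 / 1024 = 512
  busiest->avg_load                                      = 300

With local->avg_load (512) >= busiest->avg_load (300), env->imbalance is set to 0 and calculate_imbalance() returns before sds->avg_load is ever read, so computing it ahead of that check is wasted work on this path.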
Signed-off-by: zgpeng Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Samuel Liao Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/1649239025-10010-1-git-send-email-zgpeng@tencent.com --- kernel/sched/fair.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d4bd299d67ab..601f8bdc39b9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9460,8 +9460,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) / local->group_capacity; - sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) / - sds->total_capacity; /* * If the local group is more loaded than the selected * busiest group don't try to pull any tasks. @@ -9470,6 +9468,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s env->imbalance = 0; return; } + + sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) / + sds->total_capacity; } /* -- cgit From 64eaf50731ac0a8c76ce2fedd50ef6652aabc5ff Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 8 Apr 2022 19:53:08 +0800 Subject: sched/fair: Fix cfs_rq_clock_pelt() for throttled cfs_rq Since commit 23127296889f ("sched/fair: Update scale invariance of PELT") change to use rq_clock_pelt() instead of rq_clock_task(), we should also use rq_clock_pelt() for throttled_clock_task_time and throttled_clock_task accounting to get correct cfs_rq_clock_pelt() of throttled cfs_rq. And rename throttled_clock_task(_time) to be clock_pelt rather than clock_task. Fixes: 23127296889f ("sched/fair: Update scale invariance of PELT") Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ben Segall Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20220408115309.81603-1-zhouchengming@bytedance.com --- kernel/sched/fair.c | 8 ++++---- kernel/sched/pelt.h | 4 ++-- kernel/sched/sched.h | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 601f8bdc39b9..f74b34080c9a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4846,8 +4846,8 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) cfs_rq->throttle_count--; if (!cfs_rq->throttle_count) { - cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - - cfs_rq->throttled_clock_task; + cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - + cfs_rq->throttled_clock_pelt; /* Add cfs_rq with load or one or more already running entities to the list */ if (!cfs_rq_is_decayed(cfs_rq) || cfs_rq->nr_running) @@ -4864,7 +4864,7 @@ static int tg_throttle_down(struct task_group *tg, void *data) /* group is entering throttled state, stop time */ if (!cfs_rq->throttle_count) { - cfs_rq->throttled_clock_task = rq_clock_task(rq); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); list_del_leaf_cfs_rq(cfs_rq); } cfs_rq->throttle_count++; @@ -5308,7 +5308,7 @@ static void sync_throttle(struct task_group *tg, int cpu) pcfs_rq = tg->parent->cfs_rq[cpu]; cfs_rq->throttle_count = pcfs_rq->throttle_count; - cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu)); + cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu)); } /* conditionally throttle active cfs_rq's from put_prev_entity() */ diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index c336f5f481bc..4ff2ed4f8fa1 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -145,9 +145,9 @@ static inline u64 rq_clock_pelt(struct rq *rq) static inline u64 
cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) { if (unlikely(cfs_rq->throttle_count)) - return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time; + return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time; - return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; + return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time; } #else static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 58263f90c559..762be73972bd 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -603,8 +603,8 @@ struct cfs_rq { s64 runtime_remaining; u64 throttled_clock; - u64 throttled_clock_task; - u64 throttled_clock_task_time; + u64 throttled_clock_pelt; + u64 throttled_clock_pelt_time; int throttled; int throttle_count; struct list_head throttled_list; -- cgit From 0a00a354644ee1800d31c47cf5927b9b50272fac Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 8 Apr 2022 19:53:09 +0800 Subject: sched/fair: Delete useless condition in tg_unthrottle_up() We have tested cfs_rq->load.weight in cfs_rq_is_decayed(), the first condition "!cfs_rq_is_decayed(cfs_rq)" is enough to cover the second condition "cfs_rq->nr_running". Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ben Segall Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20220408115309.81603-2-zhouchengming@bytedance.com --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f74b34080c9a..3eba0dcc4962 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4850,7 +4850,7 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) cfs_rq->throttled_clock_pelt; /* Add cfs_rq with load or one or more already running entities to the list */ - if (!cfs_rq_is_decayed(cfs_rq) || cfs_rq->nr_running) + if (!cfs_rq_is_decayed(cfs_rq)) list_add_leaf_cfs_rq(cfs_rq); } -- cgit From 890d550d7dbac7a31ecaa78732aa22be282bb6b8 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 8 Apr 2022 20:19:14 +0800 Subject: sched/psi: report zeroes for CPU full at the system level Martin find it confusing when look at the /proc/pressure/cpu output, and found no hint about that CPU "full" line in psi Documentation. % cat /proc/pressure/cpu some avg10=0.92 avg60=0.91 avg300=0.73 total=933490489 full avg10=0.22 avg60=0.23 avg300=0.16 total=358783277 The PSI_CPU_FULL state is introduced by commit e7fcd7622823 ("psi: Add PSI_CPU_FULL state"), which mainly for cgroup level, but also counted at the system level as a side effect. Naturally, the FULL state doesn't exist for the CPU resource at the system level. These "full" numbers can come from CPU idle schedule latency. For example, t1 is the time when task wakeup on an idle CPU, t2 is the time when CPU pick and switch to it. The delta of (t2 - t1) will be in CPU_FULL state. Another case all processes can be stalled is when all cgroups have been throttled at the same time, which unlikely to happen. Anyway, CPU_FULL metric is meaningless and confusing at the system level. So this patch will report zeroes for CPU full at the system level, and update psi Documentation accordingly. 
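With this change the system-level interface keeps the "full" line for backward compatibility, but it stays at zero; the output would look something like this ("some" values illustrative):

  % cat /proc/pressure/cpu
  some avg10=0.92 avg60=0.91 avg300=0.73 total=933490489
  full avg10=0.00 avg60=0.00 avg300=0.00 total=0

Cgroup-level cpu.pressure files are not affected and keep reporting real "full" numbers.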
Fixes: e7fcd7622823 ("psi: Add PSI_CPU_FULL state") Reported-by: Martin Steigerwald Suggested-by: Johannes Weiner Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lore.kernel.org/r/20220408121914.82855-1-zhouchengming@bytedance.com --- Documentation/accounting/psi.rst | 9 ++++----- kernel/sched/psi.c | 15 +++++++++------ 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/Documentation/accounting/psi.rst b/Documentation/accounting/psi.rst index 860fe651d645..5e40b3f437f9 100644 --- a/Documentation/accounting/psi.rst +++ b/Documentation/accounting/psi.rst @@ -37,11 +37,7 @@ Pressure interface Pressure information for each resource is exported through the respective file in /proc/pressure/ -- cpu, memory, and io. -The format for CPU is as such:: - - some avg10=0.00 avg60=0.00 avg300=0.00 total=0 - -and for memory and IO:: +The format is as such:: some avg10=0.00 avg60=0.00 avg300=0.00 total=0 full avg10=0.00 avg60=0.00 avg300=0.00 total=0 @@ -58,6 +54,9 @@ situation from a state where some tasks are stalled but the CPU is still doing productive work. As such, time spent in this subset of the stall state is tracked separately and exported in the "full" averages. +CPU full is undefined at the system level, but has been reported +since 5.13, so it is set to zero for backward compatibility. + The ratios (in %) are tracked as recent trends over ten, sixty, and three hundred second windows, which gives insight into short term events as well as medium and long term trends. The total absolute stall time diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 5a49a8c8783e..a337f3e35997 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1060,14 +1060,17 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) mutex_unlock(&group->avgs_lock); for (full = 0; full < 2; full++) { - unsigned long avg[3]; - u64 total; + unsigned long avg[3] = { 0, }; + u64 total = 0; int w; - for (w = 0; w < 3; w++) - avg[w] = group->avg[res * 2 + full][w]; - total = div_u64(group->total[PSI_AVGS][res * 2 + full], - NSEC_PER_USEC); + /* CPU FULL is undefined at the system level */ + if (!(group == &psi_system && res == PSI_CPU && full)) { + for (w = 0; w < 3; w++) + avg[w] = group->avg[res * 2 + full][w]; + total = div_u64(group->total[PSI_AVGS][res * 2 + full], + NSEC_PER_USEC); + } seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", full ? "full" : "some", -- cgit From a658353167bf2ea6052cee071dbcc13e0f229dc9 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Fri, 15 Apr 2022 17:55:04 +0800 Subject: sched/fair: Revise comment about lb decision matrix If busiest group type is group_misfit_task, the local group type must be group_has_spare according to below code in update_sd_pick_busiest(): if (sgs->group_type == group_misfit_task && (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) || sds->local_stat.group_type != group_has_spare)) return false; group type imbalanced and overloaded and fully_busy are filtered in here. misfit and asym are filtered before in update_sg_lb_stats(). 
So, change the decision matrix to: busiest \ local has_spare fully_busy misfit asym imbalanced overloaded has_spare nr_idle balanced N/A N/A balanced balanced fully_busy nr_idle nr_idle N/A N/A balanced balanced misfit_task force N/A N/A N/A *N/A* *N/A* asym_packing force force N/A N/A force force imbalanced force force N/A N/A force force overloaded force force N/A N/A force avg_load Fixes: 0b0695f2b34a ("sched/fair: Rework load_balance()") Signed-off-by: Tao Zhou Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20220415095505.7765-1-tao.zhou@linux.dev --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3eba0dcc4962..4c420124b5d6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9496,7 +9496,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded * has_spare nr_idle balanced N/A N/A balanced balanced * fully_busy nr_idle nr_idle N/A N/A balanced balanced - * misfit_task force N/A N/A N/A force force + * misfit_task force N/A N/A N/A N/A N/A * asym_packing force force N/A N/A force force * imbalanced force force N/A N/A force force * overloaded force force N/A N/A force avg_load -- cgit From 4e3c7d338a2260406ae22eaf6d77b639d59bdc7e Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 18 Mar 2022 17:36:56 +0100 Subject: sched/fair: Refactor cpu_util_without() Except the 'task has no contribution or is new' condition at the beginning of cpu_util_without(), which it shares with the load and runnable counterpart functions, a cpu_util_next(..., dst_cpu = -1) call can replace the rest of it. The UTIL_EST specific check that task util_est has to be subtracted from the CPU one in case of an enqueued (or current (to cater for the wakeup - lb race)) task has to be moved to cpu_util_next(). This was initially introduced by commit c469933e7721 ("sched/fair: Fix cpu_util_wake() for 'execl' type workloads"). UnixBench's `execl` throughput tests were run on the dual socket 40 CPUs Intel E5-2690 v2 to make sure it doesn't regress again. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lore.kernel.org/r/20220318163656.954440-1-dietmar.eggemann@arm.com --- kernel/sched/fair.c | 157 +++++++++++++++++++--------------------------------- 1 file changed, 57 insertions(+), 100 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4c420124b5d6..7d38728ebe97 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6544,108 +6544,19 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) } /* - * cpu_util_without: compute cpu utilization without any contributions from *p - * @cpu: the CPU which utilization is requested - * @p: the task which utilization should be discounted - * - * The utilization of a CPU is defined by the utilization of tasks currently - * enqueued on that CPU as well as tasks which are currently sleeping after an - * execution on that CPU. - * - * This method returns the utilization of the specified CPU by discounting the - * utilization of the specified task, whenever the task is currently - * contributing to the CPU utilization. 
- */ -static unsigned long cpu_util_without(int cpu, struct task_struct *p) -{ - struct cfs_rq *cfs_rq; - unsigned int util; - - /* Task has no contribution or is new */ - if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) - return cpu_util_cfs(cpu); - - cfs_rq = &cpu_rq(cpu)->cfs; - util = READ_ONCE(cfs_rq->avg.util_avg); - - /* Discount task's util from CPU's util */ - lsub_positive(&util, task_util(p)); - - /* - * Covered cases: - * - * a) if *p is the only task sleeping on this CPU, then: - * cpu_util (== task_util) > util_est (== 0) - * and thus we return: - * cpu_util_without = (cpu_util - task_util) = 0 - * - * b) if other tasks are SLEEPING on this CPU, which is now exiting - * IDLE, then: - * cpu_util >= task_util - * cpu_util > util_est (== 0) - * and thus we discount *p's blocked utilization to return: - * cpu_util_without = (cpu_util - task_util) >= 0 - * - * c) if other tasks are RUNNABLE on that CPU and - * util_est > cpu_util - * then we use util_est since it returns a more restrictive - * estimation of the spare capacity on that CPU, by just - * considering the expected utilization of tasks already - * runnable on that CPU. - * - * Cases a) and b) are covered by the above code, while case c) is - * covered by the following code when estimated utilization is - * enabled. - */ - if (sched_feat(UTIL_EST)) { - unsigned int estimated = - READ_ONCE(cfs_rq->avg.util_est.enqueued); - - /* - * Despite the following checks we still have a small window - * for a possible race, when an execl's select_task_rq_fair() - * races with LB's detach_task(): - * - * detach_task() - * p->on_rq = TASK_ON_RQ_MIGRATING; - * ---------------------------------- A - * deactivate_task() \ - * dequeue_task() + RaceTime - * util_est_dequeue() / - * ---------------------------------- B - * - * The additional check on "current == p" it's required to - * properly fix the execl regression and it helps in further - * reducing the chances for the above race. - */ - if (unlikely(task_on_rq_queued(p) || current == p)) - lsub_positive(&estimated, _task_util_est(p)); - - util = max(util, estimated); - } - - /* - * Utilization (estimated) can exceed the CPU capacity, thus let's - * clamp to the maximum CPU capacity to ensure consistency with - * cpu_util. - */ - return min_t(unsigned long, util, capacity_orig_of(cpu)); -} - -/* - * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued) - * to @dst_cpu. + * Predicts what cpu_util(@cpu) would return if @p was removed from @cpu + * (@dst_cpu = -1) or migrated to @dst_cpu. */ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) { struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; - unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg); + unsigned long util = READ_ONCE(cfs_rq->avg.util_avg); /* - * If @p migrates from @cpu to another, remove its contribution. Or, - * if @p migrates from another CPU to @cpu, add its contribution. In - * the other cases, @cpu is not impacted by the migration, so the - * util_avg should already be correct. + * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its + * contribution. If @p migrates from another CPU to @cpu add its + * contribution. In all the other cases @cpu is not impacted by the + * migration so its util_avg is already correct. 
*/ if (task_cpu(p) == cpu && dst_cpu != cpu) lsub_positive(&util, task_util(p)); @@ -6653,16 +6564,40 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) util += task_util(p); if (sched_feat(UTIL_EST)) { + unsigned long util_est; + util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued); /* - * During wake-up, the task isn't enqueued yet and doesn't - * appear in the cfs_rq->avg.util_est.enqueued of any rq, - * so just add it (if needed) to "simulate" what will be - * cpu_util after the task has been enqueued. + * During wake-up @p isn't enqueued yet and doesn't contribute + * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued. + * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p + * has been enqueued. + * + * During exec (@dst_cpu = -1) @p is enqueued and does + * contribute to cpu_rq(cpu)->cfs.util_est.enqueued. + * Remove it to "simulate" cpu_util without @p's contribution. + * + * Despite the task_on_rq_queued(@p) check there is still a + * small window for a possible race when an exec + * select_task_rq_fair() races with LB's detach_task(). + * + * detach_task() + * deactivate_task() + * p->on_rq = TASK_ON_RQ_MIGRATING; + * -------------------------------- A + * dequeue_task() \ + * dequeue_task_fair() + Race Time + * util_est_dequeue() / + * -------------------------------- B + * + * The additional check "current == p" is required to further + * reduce the race window. */ if (dst_cpu == cpu) util_est += _task_util_est(p); + else if (unlikely(task_on_rq_queued(p) || current == p)) + lsub_positive(&util_est, _task_util_est(p)); util = max(util, util_est); } @@ -6670,6 +6605,28 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) return min(util, capacity_orig_of(cpu)); } +/* + * cpu_util_without: compute cpu utilization without any contributions from *p + * @cpu: the CPU which utilization is requested + * @p: the task which utilization should be discounted + * + * The utilization of a CPU is defined by the utilization of tasks currently + * enqueued on that CPU as well as tasks which are currently sleeping after an + * execution on that CPU. + * + * This method returns the utilization of the specified CPU by discounting the + * utilization of the specified task, whenever the task is currently + * contributing to the CPU utilization. + */ +static unsigned long cpu_util_without(int cpu, struct task_struct *p) +{ + /* Task has no contribution or is new */ + if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) + return cpu_util_cfs(cpu); + + return cpu_util_next(cpu, p, -1); +} + /* * compute_energy(): Estimates the energy that @pd would consume if @p was * migrated to @dst_cpu. compute_energy() predicts what will be the utilization -- cgit From 50e7b416d2ab10b9771bd00a4d85df90ad2e4b37 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 28 Apr 2022 15:43:37 +0100 Subject: sched/fair: Remove sched_trace_*() helper functions We no longer need them as we can use DWARF debug info or BTF + pahole to re-generate the required structs to compile against them for a given kernel. This moves the burden of maintaining these helper functions to the module. https://github.com/qais-yousef/sched_tp Note that pahole v1.15 is required at least for using DWARF. And for BTF v1.23 which is not yet released will be required. There's alignment problem that will lead to crashes in earlier versions when used with BTF. We should have enough infrastructure to make these helper functions now obsolete, so remove them. 
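For out-of-tree users the replacement pattern looks roughly like the module-side probe below (a sketch in the spirit of the sched_tp repository above, assuming the bare pelt_cfs_tp tracepoint that these helpers were built around; the struct cfs_rq definition has to be regenerated for the target kernel with pahole from DWARF or BTF, and the names here are illustrative):

  /* Sketch: what sched_trace_cfs_rq_avg() used to do, now inside the module. */
  static void probe_pelt_cfs_tp(void *data, struct cfs_rq *cfs_rq)
  {
          /* direct field access, using the module's own regenerated struct defs */
          trace_printk("util_avg=%lu load_avg=%lu\n",
                       cfs_rq->avg.util_avg, cfs_rq->avg.load_avg);
  }

  /* hooked up at module init, e.g. register_trace_pelt_cfs_tp(probe_pelt_cfs_tp, NULL) */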
[Rewrote commit message to reflect the new alternative] Signed-off-by: Dietmar Eggemann Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220428144338.479094-2-qais.yousef@arm.com --- include/linux/sched.h | 14 -------- kernel/sched/fair.c | 98 --------------------------------------------------- 2 files changed, 112 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 67f06f72c50e..fc74ea2578b7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2378,20 +2378,6 @@ static inline void rseq_syscall(struct pt_regs *regs) #endif -const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq); -char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len); -int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq); - -const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq); -const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq); -const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq); - -int sched_trace_rq_cpu(struct rq *rq); -int sched_trace_rq_cpu_capacity(struct rq *rq); -int sched_trace_rq_nr_running(struct rq *rq); - -const struct cpumask *sched_trace_rd_span(struct root_domain *rd); - #ifdef CONFIG_SCHED_CORE extern void sched_core_free(struct task_struct *tsk); extern void sched_core_fork(struct task_struct *p); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7d38728ebe97..19803e1c9138 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -11839,101 +11839,3 @@ __init void init_sched_fair_class(void) #endif /* SMP */ } - -/* - * Helper functions to facilitate extracting info from tracepoints. - */ - -const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq) -{ -#ifdef CONFIG_SMP - return cfs_rq ? &cfs_rq->avg : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg); - -char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len) -{ - if (!cfs_rq) { - if (str) - strlcpy(str, "(null)", len); - else - return NULL; - } - - cfs_rq_tg_path(cfs_rq, str, len); - return str; -} -EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path); - -int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq) -{ - return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1; -} -EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu); - -const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq) -{ -#ifdef CONFIG_SMP - return rq ? &rq->avg_rt : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt); - -const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq) -{ -#ifdef CONFIG_SMP - return rq ? &rq->avg_dl : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl); - -const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq) -{ -#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ) - return rq ? &rq->avg_irq : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq); - -int sched_trace_rq_cpu(struct rq *rq) -{ - return rq ? cpu_of(rq) : -1; -} -EXPORT_SYMBOL_GPL(sched_trace_rq_cpu); - -int sched_trace_rq_cpu_capacity(struct rq *rq) -{ - return rq ? -#ifdef CONFIG_SMP - rq->cpu_capacity -#else - SCHED_CAPACITY_SCALE -#endif - : -1; -} -EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity); - -const struct cpumask *sched_trace_rd_span(struct root_domain *rd) -{ -#ifdef CONFIG_SMP - return rd ? rd->span : NULL; -#else - return NULL; -#endif -} -EXPORT_SYMBOL_GPL(sched_trace_rd_span); - -int sched_trace_rq_nr_running(struct rq *rq) -{ - return rq ? 
rq->nr_running : -1; -} -EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running); -- cgit From 97956dd278d3af1b5657026b992b54cf2e1b50b9 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 28 Apr 2022 15:43:38 +0100 Subject: sched/fair: Remove cfs_rq_tg_path() cfs_rq_tg_path() is used by a tracepoint-to traceevent (tp-2-te) converter to format the path of a taskgroup or autogroup respectively. It doesn't have any in-kernel users after the removal of the sched_trace_cfs_rq_path() helper function. cfs_rq_tg_path() can be coded in a tp-2-te converter. Remove it from kernel/sched/fair.c. Signed-off-by: Dietmar Eggemann Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220428144338.479094-3-qais.yousef@arm.com --- kernel/sched/fair.c | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 19803e1c9138..6ca054b11a16 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -313,19 +313,6 @@ const struct sched_class fair_sched_class; #define for_each_sched_entity(se) \ for (; se; se = se->parent) -static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) -{ - if (!path) - return; - - if (cfs_rq && task_group_is_autogroup(cfs_rq->tg)) - autogroup_path(cfs_rq->tg, path, len); - else if (cfs_rq && cfs_rq->tg->css.cgroup) - cgroup_path(cfs_rq->tg->css.cgroup, path, len); - else - strlcpy(path, "(null)", len); -} - static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); @@ -493,12 +480,6 @@ static int se_is_idle(struct sched_entity *se) #define for_each_sched_entity(se) \ for (; se; se = NULL) -static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) -{ - if (path) - strlcpy(path, "(null)", len); -} - static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { return true; -- cgit From d664e399128bd78b905ff480917e2c2d4949e101 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Apr 2022 15:31:02 +0200 Subject: sched: Fix missing prototype warnings A W=1 build emits more than a dozen missing prototype warnings related to scheduler and scheduler specific includes. 
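The pattern behind each of these warnings is the same: a function has external linkage, but the translation unit defining it never sees a declaration, because the prototype lives as an ad-hoc 'extern' in some other .c file rather than in a shared header. A made-up minimal example of what -Wmissing-prototypes reports:

  /* foo.c */
  void sched_foo_init(void)   /* warning: no previous prototype for 'sched_foo_init' */
  {
  }

  /* bar.c */
  extern void sched_foo_init(void);   /* local extern, invisible to foo.c */

The fix is to move such declarations into a header included by both the definition and its users, which is what this patch does for kernel/sched/sched.h and kernel/sched/smp.h.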
Reported-by: kernel test robot Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220413133024.249118058@linutronix.de --- include/linux/sched.h | 2 ++ kernel/sched/build_policy.c | 2 ++ kernel/sched/build_utility.c | 1 + kernel/sched/core.c | 3 +++ kernel/sched/deadline.c | 2 -- kernel/sched/fair.c | 1 + kernel/sched/sched.h | 8 ++------ kernel/sched/smp.h | 6 ++++++ kernel/stop_machine.c | 2 -- 9 files changed, 17 insertions(+), 10 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index fc74ea2578b7..a27316f5f737 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2388,4 +2388,6 @@ static inline void sched_core_free(struct task_struct *tsk) { } static inline void sched_core_fork(struct task_struct *p) { } #endif +extern void sched_set_stop_task(int cpu, struct task_struct *stop); + #endif diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c index e0104b45029a..d9dc9ab3773f 100644 --- a/kernel/sched/build_policy.c +++ b/kernel/sched/build_policy.c @@ -15,6 +15,7 @@ /* Headers: */ #include #include +#include #include #include @@ -31,6 +32,7 @@ #include #include "sched.h" +#include "smp.h" #include "autogroup.h" #include "stats.h" diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c index eec0849b2aae..99bdd96f454f 100644 --- a/kernel/sched/build_utility.c +++ b/kernel/sched/build_utility.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 068c088e9584..e644578bc871 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -26,7 +26,10 @@ #include #include #include +#include #include +#include +#include #include #include #include diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index fb4255ae0b2c..6ae423627a7a 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1220,8 +1220,6 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se) return (dl_se->runtime <= 0); } -extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); - /* * This function implements the GRUB accounting rule: * according to the GRUB reclaiming algorithm, the runtime is diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6ca054b11a16..bc9f6e94c84e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 762be73972bd..4784898e8f83 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1833,12 +1833,7 @@ static inline void dirty_sched_domain_sysctl(int cpu) #endif extern int sched_update_scaling(void); - -extern void flush_smp_call_function_from_idle(void); - -#else /* !CONFIG_SMP: */ -static inline void flush_smp_call_function_from_idle(void) { } -#endif +#endif /* CONFIG_SMP */ #include "stats.h" @@ -2315,6 +2310,7 @@ extern void resched_cpu(int cpu); extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); +extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime); extern void init_dl_task_timer(struct sched_dl_entity *dl_se); diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h index 9620e323162c..5719bf9280e9 100644 --- a/kernel/sched/smp.h +++ b/kernel/sched/smp.h @@ -7,3 +7,9 @@ extern void sched_ttwu_pending(void *arg); extern void 
send_call_function_single_ipi(int cpu); + +#ifdef CONFIG_SMP +extern void flush_smp_call_function_from_idle(void); +#else +static inline void flush_smp_call_function_from_idle(void) { } +#endif diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index cbc30271ea4d..6da7b91af353 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -535,8 +535,6 @@ void stop_machine_park(int cpu) kthread_park(stopper->thread); } -extern void sched_set_stop_task(int cpu, struct task_struct *stop); - static void cpu_stop_create(unsigned int cpu) { sched_set_stop_task(cpu, per_cpu(cpu_stopper.thread, cpu)); -- cgit From 16bf5a5e1ec56474ed2a19d72f272ed09a5d3ea1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Apr 2022 15:31:03 +0200 Subject: smp: Rename flush_smp_call_function_from_idle() This is invoked from the stopper thread too, which is definitely not idle. Rename it to flush_smp_call_function_queue() and fixup the callers. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220413133024.305001096@linutronix.de --- kernel/sched/core.c | 2 +- kernel/sched/idle.c | 2 +- kernel/sched/smp.h | 4 ++-- kernel/smp.c | 27 ++++++++++++++++++++------- 4 files changed, 24 insertions(+), 11 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e644578bc871..07bacb050198 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2411,7 +2411,7 @@ static int migration_cpu_stop(void *data) * __migrate_task() such that we will not miss enforcing cpus_ptr * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. */ - flush_smp_call_function_from_idle(); + flush_smp_call_function_queue(); raw_spin_lock(&p->pi_lock); rq_lock(rq, &rf); diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 8f8b5020e76a..60295dbe0d62 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -327,7 +327,7 @@ static void do_idle(void) * RCU relies on this call to be done outside of an RCU read-side * critical section. */ - flush_smp_call_function_from_idle(); + flush_smp_call_function_queue(); schedule_idle(); if (unlikely(klp_patch_pending(current))) diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h index 5719bf9280e9..2eb23dd0f285 100644 --- a/kernel/sched/smp.h +++ b/kernel/sched/smp.h @@ -9,7 +9,7 @@ extern void sched_ttwu_pending(void *arg); extern void send_call_function_single_ipi(int cpu); #ifdef CONFIG_SMP -extern void flush_smp_call_function_from_idle(void); +extern void flush_smp_call_function_queue(void); #else -static inline void flush_smp_call_function_from_idle(void) { } +static inline void flush_smp_call_function_queue(void) { } #endif diff --git a/kernel/smp.c b/kernel/smp.c index 01a7c1706a58..8e85f22ed538 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -96,7 +96,7 @@ static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data); static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); -static void flush_smp_call_function_queue(bool warn_cpu_offline); +static void __flush_smp_call_function_queue(bool warn_cpu_offline); int smpcfd_prepare_cpu(unsigned int cpu) { @@ -141,7 +141,7 @@ int smpcfd_dying_cpu(unsigned int cpu) * ensure that the outgoing CPU doesn't go offline with work * still pending. 
*/ - flush_smp_call_function_queue(false); + __flush_smp_call_function_queue(false); irq_work_run(); return 0; } @@ -541,11 +541,11 @@ void generic_smp_call_function_single_interrupt(void) { cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->gotipi, CFD_SEQ_NOCPU, smp_processor_id(), CFD_SEQ_GOTIPI); - flush_smp_call_function_queue(true); + __flush_smp_call_function_queue(true); } /** - * flush_smp_call_function_queue - Flush pending smp-call-function callbacks + * __flush_smp_call_function_queue - Flush pending smp-call-function callbacks * * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an * offline CPU. Skip this check if set to 'false'. @@ -558,7 +558,7 @@ void generic_smp_call_function_single_interrupt(void) * Loop through the call_single_queue and run all the queued callbacks. * Must be called with interrupts disabled. */ -static void flush_smp_call_function_queue(bool warn_cpu_offline) +static void __flush_smp_call_function_queue(bool warn_cpu_offline) { call_single_data_t *csd, *csd_next; struct llist_node *entry, *prev; @@ -681,7 +681,20 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) smp_processor_id(), CFD_SEQ_HDLEND); } -void flush_smp_call_function_from_idle(void) + +/** + * flush_smp_call_function_queue - Flush pending smp-call-function callbacks + * from task context (idle, migration thread) + * + * When TIF_POLLING_NRFLAG is supported and a CPU is in idle and has it + * set, then remote CPUs can avoid sending IPIs and wake the idle CPU by + * setting TIF_NEED_RESCHED. The idle task on the woken up CPU has to + * handle queued SMP function calls before scheduling. + * + * The migration thread has to ensure that an eventually pending wakeup has + * been handled before it migrates a task. + */ +void flush_smp_call_function_queue(void) { unsigned long flags; @@ -691,7 +704,7 @@ void flush_smp_call_function_from_idle(void) cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->idle, CFD_SEQ_NOCPU, smp_processor_id(), CFD_SEQ_IDLE); local_irq_save(flags); - flush_smp_call_function_queue(true); + __flush_smp_call_function_queue(true); if (local_softirq_pending()) do_softirq(); -- cgit From 1a90bfd220201fbe050dfc15deaac20ca5f15638 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 13 Apr 2022 15:31:05 +0200 Subject: smp: Make softirq handling RT safe in flush_smp_call_function_queue() flush_smp_call_function_queue() invokes do_softirq() which is not available on PREEMPT_RT. flush_smp_call_function_queue() is invoked from the idle task and the migration task with preemption or interrupts disabled. So RT kernels cannot process soft interrupts in that context as that has to acquire 'sleeping spinlocks' which is not possible with preemption or interrupts disabled and forbidden from the idle task anyway. The currently known SMP function call which raises a soft interrupt is in the block layer, but this functionality is not enabled on RT kernels due to latency and performance reasons. RT could wake up ksoftirqd unconditionally, but this wants to be avoided if there were soft interrupts pending already when this is invoked in the context of the migration task. The migration task might have preempted a threaded interrupt handler which raised a soft interrupt, but did not reach the local_bh_enable() to process it. The "running" ksoftirqd might prevent the handling in the interrupt thread context which is causing latency issues. Add a new function which handles this case explicitely for RT and falls back to do_softirq() on !RT kernels. 
In the RT case this warns when one of the flushed SMP function calls raised a soft interrupt so this can be investigated. [ tglx: Moved the RT part out of SMP code ] Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/YgKgL6aPj8aBES6G@linutronix.de Link: https://lore.kernel.org/r/20220413133024.356509586@linutronix.de --- include/linux/interrupt.h | 9 +++++++++ kernel/smp.c | 5 ++++- kernel/softirq.c | 13 +++++++++++++ 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index f40754caaefa..a49fe8d88676 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -607,6 +607,15 @@ struct softirq_action asmlinkage void do_softirq(void); asmlinkage void __do_softirq(void); +#ifdef CONFIG_PREEMPT_RT +extern void do_softirq_post_smp_call_flush(unsigned int was_pending); +#else +static inline void do_softirq_post_smp_call_flush(unsigned int unused) +{ + do_softirq(); +} +#endif + extern void open_softirq(int nr, void (*action)(struct softirq_action *)); extern void softirq_init(void); extern void __raise_softirq_irqoff(unsigned int nr); diff --git a/kernel/smp.c b/kernel/smp.c index 8e85f22ed538..d54c2fe51ada 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -696,6 +696,7 @@ static void __flush_smp_call_function_queue(bool warn_cpu_offline) */ void flush_smp_call_function_queue(void) { + unsigned int was_pending; unsigned long flags; if (llist_empty(this_cpu_ptr(&call_single_queue))) @@ -704,9 +705,11 @@ void flush_smp_call_function_queue(void) cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->idle, CFD_SEQ_NOCPU, smp_processor_id(), CFD_SEQ_IDLE); local_irq_save(flags); + /* Get the already pending soft interrupts for RT enabled kernels */ + was_pending = local_softirq_pending(); __flush_smp_call_function_queue(true); if (local_softirq_pending()) - do_softirq(); + do_softirq_post_smp_call_flush(was_pending); local_irq_restore(flags); } diff --git a/kernel/softirq.c b/kernel/softirq.c index fac801815554..9f0aef8aa9ff 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -294,6 +294,19 @@ static inline void invoke_softirq(void) wakeup_softirqd(); } +/* + * flush_smp_call_function_queue() can raise a soft interrupt in a function + * call. On RT kernels this is undesired and the only known functionality + * in the block layer which does this is disabled on RT. If soft interrupts + * get raised which haven't been raised before the flush, warn so it can be + * investigated. + */ +void do_softirq_post_smp_call_flush(unsigned int was_pending) +{ + if (WARN_ON_ONCE(was_pending != local_softirq_pending())) + invoke_softirq(); +} + #else /* CONFIG_PREEMPT_RT */ /* -- cgit From 2679a83731d51a744657f718fc02c3b077e47562 Mon Sep 17 00:00:00 2001 From: Hao Jia Date: Sat, 30 Apr 2022 16:58:42 +0800 Subject: sched/core: Avoid obvious double update_rq_clock warning When we use raw_spin_rq_lock() to acquire the rq lock and have to update the rq clock while holding the lock, the kernel may issue a WARN_DOUBLE_CLOCK warning. Since we directly use raw_spin_rq_lock() to acquire rq lock instead of rq_lock(), there is no corresponding change to rq->clock_update_flags. In particular, we have obtained the rq lock of other CPUs, the rq->clock_update_flags of this CPU may be RQCF_UPDATED at this time, and then calling update_rq_clock() will trigger the WARN_DOUBLE_CLOCK warning. 
So we need to clear RQCF_UPDATED of rq->clock_update_flags to avoid the WARN_DOUBLE_CLOCK warning. For the sched_rt_period_timer() and migrate_task_rq_dl() cases we simply replace raw_spin_rq_lock()/raw_spin_rq_unlock() with rq_lock()/rq_unlock(). For the {pull,push}_{rt,dl}_task() cases, we add the double_rq_clock_clear_update() function to clear RQCF_UPDATED of rq->clock_update_flags, and call double_rq_clock_clear_update() before double_lock_balance()/double_rq_lock() returns to avoid the WARN_DOUBLE_CLOCK warning. Some call trace reports: Call Trace 1: sched_rt_period_timer+0x10f/0x3a0 ? enqueue_top_rt_rq+0x110/0x110 __hrtimer_run_queues+0x1a9/0x490 hrtimer_interrupt+0x10b/0x240 __sysvec_apic_timer_interrupt+0x8a/0x250 sysvec_apic_timer_interrupt+0x9a/0xd0 asm_sysvec_apic_timer_interrupt+0x12/0x20 Call Trace 2: activate_task+0x8b/0x110 push_rt_task.part.108+0x241/0x2c0 push_rt_tasks+0x15/0x30 finish_task_switch+0xaa/0x2e0 ? __switch_to+0x134/0x420 __schedule+0x343/0x8e0 ? hrtimer_start_range_ns+0x101/0x340 schedule+0x4e/0xb0 do_nanosleep+0x8e/0x160 hrtimer_nanosleep+0x89/0x120 ? hrtimer_init_sleeper+0x90/0x90 __x64_sys_nanosleep+0x96/0xd0 do_syscall_64+0x34/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae Call Trace 3: deactivate_task+0x93/0xe0 pull_rt_task+0x33e/0x400 balance_rt+0x7e/0x90 __schedule+0x62f/0x8e0 do_task_dead+0x3f/0x50 do_exit+0x7b8/0xbb0 do_group_exit+0x2d/0x90 get_signal+0x9df/0x9e0 ? preempt_count_add+0x56/0xa0 ? __remove_hrtimer+0x35/0x70 arch_do_signal_or_restart+0x36/0x720 ? nanosleep_copyout+0x39/0x50 ? do_nanosleep+0x131/0x160 ? audit_filter_inodes+0xf5/0x120 exit_to_user_mode_prepare+0x10f/0x1e0 syscall_exit_to_user_mode+0x17/0x30 do_syscall_64+0x40/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae Call Trace 4: update_rq_clock+0x128/0x1a0 migrate_task_rq_dl+0xec/0x310 set_task_cpu+0x84/0x1e4 try_to_wake_up+0x1d8/0x5c0 wake_up_process+0x1c/0x30 hrtimer_wakeup+0x24/0x3c __hrtimer_run_queues+0x114/0x270 hrtimer_interrupt+0xe8/0x244 arch_timer_handler_phys+0x30/0x50 handle_percpu_devid_irq+0x88/0x140 generic_handle_domain_irq+0x40/0x60 gic_handle_irq+0x48/0xe0 call_on_irq_stack+0x2c/0x60 do_interrupt_handler+0x80/0x84 Steps to reproduce: 1. Enable CONFIG_SCHED_DEBUG when compiling the kernel 2. echo 1 > /sys/kernel/debug/clear_warn_once echo "WARN_DOUBLE_CLOCK" > /sys/kernel/debug/sched/features echo "NO_RT_PUSH_IPI" > /sys/kernel/debug/sched/features 3. Run some rt/dl tasks that periodically work and sleep, e.g. Create 2*n rt or dl (90% running) tasks via rt-app (on a system with n CPUs), and Dietmar Eggemann reports Call Trace 4 when running on PREEMPT_RT kernel. 
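For reference, the warning itself comes from the SCHED_DEBUG bookkeeping around the rq clock, roughly (paraphrased; details vary by kernel version):

  /* update_rq_clock(), CONFIG_SCHED_DEBUG: */
  if (sched_feat(WARN_DOUBLE_CLOCK))
          SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
  rq->clock_update_flags |= RQCF_UPDATED;

rq_lock() clears RQCF_UPDATED via rq_pin_lock() when the lock is taken, but raw_spin_rq_lock() leaves clock_update_flags untouched, so a stale RQCF_UPDATED from an earlier update on that rq survives and the next update_rq_clock() trips the warning. That is why the fix either switches to rq_lock()/rq_unlock() or explicitly clears the flag after double_lock_balance()/double_rq_lock().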
Signed-off-by: Hao Jia Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Link: https://lore.kernel.org/r/20220430085843.62939-2-jiahao.os@bytedance.com --- kernel/sched/core.c | 6 +++--- kernel/sched/deadline.c | 5 +++-- kernel/sched/rt.c | 5 +++-- kernel/sched/sched.h | 28 ++++++++++++++++++++++++---- 4 files changed, 33 insertions(+), 11 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 48cfad152b86..2c9f5e97682f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -613,10 +613,10 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2) swap(rq1, rq2); raw_spin_rq_lock(rq1); - if (__rq_lockp(rq1) == __rq_lockp(rq2)) - return; + if (__rq_lockp(rq1) != __rq_lockp(rq2)) + raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING); - raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING); + double_rq_clock_clear_update(rq1, rq2); } #endif diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 6ae423627a7a..0ad2818e77a2 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1830,6 +1830,7 @@ out: static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused) { + struct rq_flags rf; struct rq *rq; if (READ_ONCE(p->__state) != TASK_WAKING) @@ -1841,7 +1842,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused * from try_to_wake_up(). Hence, p->pi_lock is locked, but * rq->lock is not... So, lock it */ - raw_spin_rq_lock(rq); + rq_lock(rq, &rf); if (p->dl.dl_non_contending) { update_rq_clock(rq); sub_running_bw(&p->dl, &rq->dl); @@ -1857,7 +1858,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused put_task_struct(p); } sub_rq_bw(&p->dl, &rq->dl); - raw_spin_rq_unlock(rq); + rq_unlock(rq, &rf); } static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a32c46889af8..7891c0f0e1ff 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -871,6 +871,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) int enqueue = 0; struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); struct rq *rq = rq_of_rt_rq(rt_rq); + struct rq_flags rf; int skip; /* @@ -885,7 +886,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (skip) continue; - raw_spin_rq_lock(rq); + rq_lock(rq, &rf); update_rq_clock(rq); if (rt_rq->rt_time) { @@ -923,7 +924,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (enqueue) sched_rt_rq_enqueue(rt_rq); - raw_spin_rq_unlock(rq); + rq_unlock(rq, &rf); } if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7f338c53ce42..fe4d1acb7e38 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2474,6 +2474,24 @@ unsigned long arch_scale_freq_capacity(int cpu) } #endif +#ifdef CONFIG_SCHED_DEBUG +/* + * In double_lock_balance()/double_rq_lock(), we use raw_spin_rq_lock() to + * acquire rq lock instead of rq_lock(). So at the end of these two functions + * we need to call double_rq_clock_clear_update() to clear RQCF_UPDATED of + * rq->clock_update_flags to avoid the WARN_DOUBLE_CLOCK warning. + */ +static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) +{ + rq1->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); + /* rq1 == rq2 for !CONFIG_SMP, so just clear RQCF_UPDATED once. 
*/ +#ifdef CONFIG_SMP + rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); +#endif +} +#else +static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) {} +#endif #ifdef CONFIG_SMP @@ -2539,14 +2557,15 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) __acquires(busiest->lock) __acquires(this_rq->lock) { - if (__rq_lockp(this_rq) == __rq_lockp(busiest)) - return 0; - - if (likely(raw_spin_rq_trylock(busiest))) + if (__rq_lockp(this_rq) == __rq_lockp(busiest) || + likely(raw_spin_rq_trylock(busiest))) { + double_rq_clock_clear_update(this_rq, busiest); return 0; + } if (rq_order_less(this_rq, busiest)) { raw_spin_rq_lock_nested(busiest, SINGLE_DEPTH_NESTING); + double_rq_clock_clear_update(this_rq, busiest); return 0; } @@ -2640,6 +2659,7 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) BUG_ON(rq1 != rq2); raw_spin_rq_lock(rq1); __acquire(rq2->lock); /* Fake it out ;) */ + double_rq_clock_clear_update(rq1, rq2); } /* -- cgit From 734387ec2f9d77b00276042b1fa7c95f48ee879d Mon Sep 17 00:00:00 2001 From: Hao Jia Date: Sat, 30 Apr 2022 16:58:43 +0800 Subject: sched/deadline: Remove superfluous rq clock update in push_dl_task() The change to call update_rq_clock() before activate_task() commit 840d719604b0 ("sched/deadline: Update rq_clock of later_rq when pushing a task") is no longer needed since commit f4904815f97a ("sched/deadline: Fix double accounting of rq/running bw in push & pull") removed the add_running_bw() before the activate_task(). So we remove some comments that are no longer needed and update rq clock in activate_task(). Signed-off-by: Hao Jia Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Reviewed-by: Daniel Bristot de Oliveira Link: https://lore.kernel.org/r/20220430085843.62939-3-jiahao.os@bytedance.com --- kernel/sched/deadline.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0ad2818e77a2..936817ae142f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2318,13 +2318,7 @@ retry: deactivate_task(rq, next_task, 0); set_task_cpu(next_task, later_rq->cpu); - - /* - * Update the later_rq clock here, because the clock is used - * by the cpufreq_update_util() inside __add_running_bw(). - */ - update_rq_clock(later_rq); - activate_task(later_rq, next_task, ENQUEUE_NOCLOCK); + activate_task(later_rq, next_task, 0); ret = 1; resched_curr(later_rq); -- cgit From 546a3fee174969ff323d70ff27b1ef181f0d7ceb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 17 May 2022 13:46:54 +0200 Subject: sched: Reverse sched_class layout Because GCC-12 is fully stupid about array bounds and it's just really hard to get a solid array definition from a linker script, flip the array order to avoid needing negative offsets :-/ This makes the whole relational pointer magic a little less obvious, but alas. 
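Concretely, after this change the highest-priority class ends up at the lowest address, so "above" becomes a plain pointer comparison and iteration walks addresses upwards. With the macros introduced by this patch (layout annotated for illustration):

  /* linker layout, descending priority:
   *   __sched_class_highest -> stop, dl, rt, fair, idle -> __sched_class_lowest
   * (__sched_class_lowest marks one past the idle class)
   */
  #define for_class_range(class, _from, _to) \
          for (class = (_from); class < (_to); class++)

  #define for_each_class(class) \
          for_class_range(class, __sched_class_highest, __sched_class_lowest)

  #define sched_class_above(_a, _b)       ((_a) < (_b))

For example, sched_class_above(&rt_sched_class, &fair_sched_class) is true because rt_sched_class is emitted (and therefore addressed) before fair_sched_class.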
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kees Cook Link: https://lkml.kernel.org/r/YoOLLmLG7HRTXeEm@hirez.programming.kicks-ass.net --- include/asm-generic/vmlinux.lds.h | 12 ++++++------ kernel/sched/core.c | 12 ++++++------ kernel/sched/sched.h | 15 ++++++++------- 3 files changed, 20 insertions(+), 19 deletions(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 69138e9db787..7515a465ec03 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -126,13 +126,13 @@ */ #define SCHED_DATA \ STRUCT_ALIGN(); \ - __begin_sched_classes = .; \ - *(__idle_sched_class) \ - *(__fair_sched_class) \ - *(__rt_sched_class) \ - *(__dl_sched_class) \ + __sched_class_highest = .; \ *(__stop_sched_class) \ - __end_sched_classes = .; + *(__dl_sched_class) \ + *(__rt_sched_class) \ + *(__fair_sched_class) \ + *(__idle_sched_class) \ + __sched_class_lowest = .; /* The actual configuration determine if the init/exit sections * are handled as text/data or they can be discarded (which diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2c9f5e97682f..66c4e5922fe1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2193,7 +2193,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) { if (p->sched_class == rq->curr->sched_class) rq->curr->sched_class->check_preempt_curr(rq, p, flags); - else if (p->sched_class > rq->curr->sched_class) + else if (sched_class_above(p->sched_class, rq->curr->sched_class)) resched_curr(rq); /* @@ -5692,7 +5692,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * higher scheduling class, because otherwise those lose the * opportunity to pull in more work from other CPUs. */ - if (likely(prev->sched_class <= &fair_sched_class && + if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) && rq->nr_running == rq->cfs.h_nr_running)) { p = pick_next_task_fair(rq, prev, rf); @@ -9472,11 +9472,11 @@ void __init sched_init(void) int i; /* Make sure the linker didn't screw up */ - BUG_ON(&idle_sched_class + 1 != &fair_sched_class || - &fair_sched_class + 1 != &rt_sched_class || - &rt_sched_class + 1 != &dl_sched_class); + BUG_ON(&idle_sched_class != &fair_sched_class + 1 || + &fair_sched_class != &rt_sched_class + 1 || + &rt_sched_class != &dl_sched_class + 1); #ifdef CONFIG_SMP - BUG_ON(&dl_sched_class + 1 != &stop_sched_class); + BUG_ON(&dl_sched_class != &stop_sched_class + 1); #endif wait_bit_init(); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fe4d1acb7e38..2ce18584dca3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2177,6 +2177,8 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next) * * include/asm-generic/vmlinux.lds.h * + * *CAREFUL* they are laid out in *REVERSE* order!!! + * * Also enforce alignment on the instance, not the type, to guarantee layout. 
*/ #define DEFINE_SCHED_CLASS(name) \ @@ -2185,17 +2187,16 @@ const struct sched_class name##_sched_class \ __section("__" #name "_sched_class") /* Defined in include/asm-generic/vmlinux.lds.h */ -extern struct sched_class __begin_sched_classes[]; -extern struct sched_class __end_sched_classes[]; - -#define sched_class_highest (__end_sched_classes - 1) -#define sched_class_lowest (__begin_sched_classes - 1) +extern struct sched_class __sched_class_highest[]; +extern struct sched_class __sched_class_lowest[]; #define for_class_range(class, _from, _to) \ - for (class = (_from); class != (_to); class--) + for (class = (_from); class < (_to); class++) #define for_each_class(class) \ - for_class_range(class, sched_class_highest, sched_class_lowest) + for_class_range(class, __sched_class_highest, __sched_class_lowest) + +#define sched_class_above(_a, _b) ((_a) < (_b)) extern const struct sched_class stop_sched_class; extern const struct sched_class dl_sched_class; -- cgit From 991d8d8142cad94f9c5c05db25e67fa83d6f772a Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 13 May 2022 11:34:33 +0200 Subject: topology: Remove unused cpu_cluster_mask() default_topology[] uses cpu_clustergroup_mask() for the CLS level (guarded by CONFIG_SCHED_CLUSTER) which is currently provided by x86 (arch/x86/kernel/smpboot.c) and arm64 (drivers/base/arch_topology.c). Fixes: 778c558f49a2c ("sched: Add cluster scheduler level in core and related Kconfig for ARM64") Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Barry Song Link: https://lore.kernel.org/r/20220513093433.425163-1-dietmar.eggemann@arm.com --- include/linux/topology.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/include/linux/topology.h b/include/linux/topology.h index f19bc3626297..4564faafd0e1 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -240,13 +240,6 @@ static inline const struct cpumask *cpu_smt_mask(int cpu) } #endif -#if defined(CONFIG_SCHED_CLUSTER) && !defined(cpu_cluster_mask) -static inline const struct cpumask *cpu_cluster_mask(int cpu) -{ - return topology_cluster_cpumask(cpu); -} -#endif - static inline const struct cpumask *cpu_cpu_mask(int cpu) { return cpumask_of_node(cpu_to_node(cpu)); -- cgit