diff options
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r-- | kernel/sched/fair.c | 113 |
1 files changed, 83 insertions, 30 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5146163bfabb..ee0664c9d291 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -20,7 +20,38 @@ * Adaptive scheduling granularity, math enhancements by Peter Zijlstra * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ +#include <linux/energy_model.h> +#include <linux/mmap_lock.h> +#include <linux/hugetlb_inline.h> +#include <linux/jiffies.h> +#include <linux/mm_api.h> +#include <linux/highmem.h> +#include <linux/spinlock_api.h> +#include <linux/cpumask_api.h> +#include <linux/lockdep_api.h> +#include <linux/softirq.h> +#include <linux/refcount_api.h> +#include <linux/topology.h> +#include <linux/sched/clock.h> +#include <linux/sched/cond_resched.h> +#include <linux/sched/cputime.h> +#include <linux/sched/isolation.h> + +#include <linux/cpuidle.h> +#include <linux/interrupt.h> +#include <linux/mempolicy.h> +#include <linux/mutex_api.h> +#include <linux/profile.h> +#include <linux/psi.h> +#include <linux/ratelimit.h> + +#include <asm/switch_to.h> + +#include <linux/sched/cond_resched.h> + #include "sched.h" +#include "stats.h" +#include "autogroup.h" /* * Targeted preemption latency for CPU-bound tasks: @@ -1259,10 +1290,10 @@ static bool numa_is_active_node(int nid, struct numa_group *ng) /* Handle placement on systems where not all nodes are directly connected. */ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, - int maxdist, bool task) + int lim_dist, bool task) { unsigned long score = 0; - int node; + int node, max_dist; /* * All nodes are directly connected, and the same distance @@ -1271,6 +1302,8 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, if (sched_numa_topology_type == NUMA_DIRECT) return 0; + /* sched_max_numa_distance may be changed in parallel. */ + max_dist = READ_ONCE(sched_max_numa_distance); /* * This code is called for each node, introducing N^2 complexity, * which should be ok given the number of nodes rarely exceeds 8. @@ -1283,7 +1316,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, * The furthest away nodes in the system are not interesting * for placement; nid was already counted. */ - if (dist == sched_max_numa_distance || node == nid) + if (dist >= max_dist || node == nid) continue; /* @@ -1293,8 +1326,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, * "hoplimit", only nodes closer by than "hoplimit" are part * of each group. Skip other nodes. */ - if (sched_numa_topology_type == NUMA_BACKPLANE && - dist >= maxdist) + if (sched_numa_topology_type == NUMA_BACKPLANE && dist >= lim_dist) continue; /* Add up the faults from nearby nodes. */ @@ -1312,8 +1344,8 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, * This seems to result in good task placement. */ if (sched_numa_topology_type == NUMA_GLUELESS_MESH) { - faults *= (sched_max_numa_distance - dist); - faults /= (sched_max_numa_distance - LOCAL_DISTANCE); + faults *= (max_dist - dist); + faults /= (max_dist - LOCAL_DISTANCE); } score += faults; @@ -1489,6 +1521,7 @@ struct task_numa_env { int src_cpu, src_nid; int dst_cpu, dst_nid; + int imb_numa_nr; struct numa_stats src_stats, dst_stats; @@ -1503,7 +1536,7 @@ struct task_numa_env { static unsigned long cpu_load(struct rq *rq); static unsigned long cpu_runnable(struct rq *rq); static inline long adjust_numa_imbalance(int imbalance, - int dst_running, int dst_weight); + int dst_running, int imb_numa_nr); static inline enum numa_type numa_classify(unsigned int imbalance_pct, @@ -1884,7 +1917,7 @@ static void task_numa_find_cpu(struct task_numa_env *env, dst_running = env->dst_stats.nr_running + 1; imbalance = max(0, dst_running - src_running); imbalance = adjust_numa_imbalance(imbalance, dst_running, - env->dst_stats.weight); + env->imb_numa_nr); /* Use idle CPU if there is no imbalance */ if (!imbalance) { @@ -1949,8 +1982,10 @@ static int task_numa_migrate(struct task_struct *p) */ rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu)); - if (sd) + if (sd) { env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2; + env.imb_numa_nr = sd->imb_numa_nr; + } rcu_read_unlock(); /* @@ -1985,7 +2020,7 @@ static int task_numa_migrate(struct task_struct *p) */ ng = deref_curr_numa_group(p); if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) { - for_each_online_node(nid) { + for_each_node_state(nid, N_CPU) { if (nid == env.src_nid || nid == p->numa_preferred_nid) continue; @@ -2083,13 +2118,13 @@ static void numa_group_count_active_nodes(struct numa_group *numa_group) unsigned long faults, max_faults = 0; int nid, active_nodes = 0; - for_each_online_node(nid) { + for_each_node_state(nid, N_CPU) { faults = group_faults_cpu(numa_group, nid); if (faults > max_faults) max_faults = faults; } - for_each_online_node(nid) { + for_each_node_state(nid, N_CPU) { faults = group_faults_cpu(numa_group, nid); if (faults * ACTIVE_NODE_FRACTION > max_faults) active_nodes++; @@ -2243,7 +2278,7 @@ static int preferred_group_nid(struct task_struct *p, int nid) dist = sched_max_numa_distance; - for_each_online_node(node) { + for_each_node_state(node, N_CPU) { score = group_weight(p, node, dist); if (score > max_score) { max_score = score; @@ -2262,7 +2297,7 @@ static int preferred_group_nid(struct task_struct *p, int nid) * inside the highest scoring group of nodes. The nodemask tricks * keep the complexity of the search down. */ - nodes = node_online_map; + nodes = node_states[N_CPU]; for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) { unsigned long max_faults = 0; nodemask_t max_group = NODE_MASK_NONE; @@ -2401,6 +2436,21 @@ static void task_numa_placement(struct task_struct *p) } } + /* Cannot migrate task to CPU-less node */ + if (max_nid != NUMA_NO_NODE && !node_state(max_nid, N_CPU)) { + int near_nid = max_nid; + int distance, near_distance = INT_MAX; + + for_each_node_state(nid, N_CPU) { + distance = node_distance(max_nid, nid); + if (distance < near_distance) { + near_nid = nid; + near_distance = distance; + } + } + max_nid = near_nid; + } + if (ng) { numa_group_count_active_nodes(ng); spin_unlock_irq(group_lock); @@ -2825,6 +2875,8 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) /* Protect against double add, see task_tick_numa and task_numa_work */ p->numa_work.next = &p->numa_work; p->numa_faults = NULL; + p->numa_pages_migrated = 0; + p->total_numa_faults = 0; RCU_INIT_POINTER(p->numa_group, NULL); p->last_task_numa_placement = 0; p->last_sum_exec_runtime = 0; @@ -9040,9 +9092,9 @@ static bool update_pick_idlest(struct sched_group *idlest, * This is an approximation as the number of running tasks may not be * related to the number of busy CPUs due to sched_setaffinity. */ -static inline bool allow_numa_imbalance(int dst_running, int dst_weight) +static inline bool allow_numa_imbalance(int running, int imb_numa_nr) { - return (dst_running < (dst_weight >> 2)); + return running <= imb_numa_nr; } /* @@ -9176,12 +9228,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) return idlest; #endif /* - * Otherwise, keep the task on this node to stay close - * its wakeup source and improve locality. If there is - * a real need of migration, periodic load balance will - * take care of it. + * Otherwise, keep the task close to the wakeup source + * and improve locality if the number of running tasks + * would remain below threshold where an imbalance is + * allowed. If there is a real need of migration, + * periodic load balance will take care of it. */ - if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight)) + if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, sd->imb_numa_nr)) return NULL; } @@ -9273,9 +9326,9 @@ next_group: #define NUMA_IMBALANCE_MIN 2 static inline long adjust_numa_imbalance(int imbalance, - int dst_running, int dst_weight) + int dst_running, int imb_numa_nr) { - if (!allow_numa_imbalance(dst_running, dst_weight)) + if (!allow_numa_imbalance(dst_running, imb_numa_nr)) return imbalance; /* @@ -9387,7 +9440,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /* Consider allowing a small imbalance between NUMA groups */ if (env->sd->flags & SD_NUMA) { env->imbalance = adjust_numa_imbalance(env->imbalance, - busiest->sum_nr_running, busiest->group_weight); + local->sum_nr_running + 1, env->sd->imb_numa_nr); } return; @@ -10351,7 +10404,7 @@ static inline int on_null_domain(struct rq *rq) * - When one of the busy CPUs notice that there may be an idle rebalancing * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. - * - HK_FLAG_MISC CPUs are used for this task, because HK_FLAG_SCHED not set + * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED not set * anywhere yet. */ @@ -10360,7 +10413,7 @@ static inline int find_new_ilb(void) int ilb; const struct cpumask *hk_mask; - hk_mask = housekeeping_cpumask(HK_FLAG_MISC); + hk_mask = housekeeping_cpumask(HK_TYPE_MISC); for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) { @@ -10376,7 +10429,7 @@ static inline int find_new_ilb(void) /* * Kick a CPU to do the nohz balancing, if it is time for it. We pick any - * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one). + * idle CPU in the HK_TYPE_MISC housekeeping set (if there is one). */ static void kick_ilb(unsigned int flags) { @@ -10589,7 +10642,7 @@ void nohz_balance_enter_idle(int cpu) return; /* Spare idle load balancing on CPUs that don't want to be disturbed: */ - if (!housekeeping_cpu(cpu, HK_FLAG_SCHED)) + if (!housekeeping_cpu(cpu, HK_TYPE_SCHED)) return; /* @@ -10805,7 +10858,7 @@ static void nohz_newidle_balance(struct rq *this_rq) * This CPU doesn't want to be disturbed by scheduler * housekeeping */ - if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED)) + if (!housekeeping_cpu(this_cpu, HK_TYPE_SCHED)) return; /* Will wake up very soon. No time for doing anything else*/ |