Loading include/linux/sched/sysctl.h +1 −0 Original line number Diff line number Diff line Loading @@ -56,6 +56,7 @@ extern unsigned int sysctl_sched_coloc_busy_hyst_max_ms; extern unsigned int sysctl_sched_window_stats_policy; extern unsigned int sysctl_sched_ravg_window_nr_ticks; extern unsigned int sysctl_sched_dynamic_ravg_window_enable; extern unsigned int sysctl_sched_prefer_spread; extern int walt_proc_group_thresholds_handler(struct ctl_table *table, int write, Loading include/trace/events/sched.h +8 −6 Original line number Diff line number Diff line Loading @@ -278,11 +278,11 @@ TRACE_EVENT(sched_load_balance, unsigned long group_mask, int busiest_nr_running, unsigned long imbalance, unsigned int env_flags, int ld_moved, unsigned int balance_interval, int active_balance, int overutilized), int overutilized, int prefer_spread), TP_ARGS(cpu, idle, balance, group_mask, busiest_nr_running, imbalance, env_flags, ld_moved, balance_interval, active_balance, overutilized), active_balance, overutilized, prefer_spread), TP_STRUCT__entry( __field(int, cpu) Loading @@ -296,6 +296,7 @@ TRACE_EVENT(sched_load_balance, __field(unsigned int, balance_interval) __field(int, active_balance) __field(int, overutilized) __field(int, prefer_spread) ), TP_fast_assign( Loading @@ -310,9 +311,10 @@ TRACE_EVENT(sched_load_balance, __entry->balance_interval = balance_interval; __entry->active_balance = active_balance; __entry->overutilized = overutilized; __entry->prefer_spread = prefer_spread; ), TP_printk("cpu=%d state=%s balance=%d group=%#lx busy_nr=%d imbalance=%ld flags=%#x ld_moved=%d bal_int=%d active_balance=%d sd_overutilized=%d", TP_printk("cpu=%d state=%s balance=%d group=%#lx busy_nr=%d imbalance=%ld flags=%#x ld_moved=%d bal_int=%d active_balance=%d sd_overutilized=%d prefer_spread=%d", __entry->cpu, __entry->idle == CPU_IDLE ? "idle" : (__entry->idle == CPU_NEWLY_IDLE ? "newly_idle" : "busy"), Loading @@ -320,7 +322,7 @@ TRACE_EVENT(sched_load_balance, __entry->group_mask, __entry->busiest_nr_running, __entry->imbalance, __entry->env_flags, __entry->ld_moved, __entry->balance_interval, __entry->active_balance, __entry->overutilized) __entry->overutilized, __entry->prefer_spread) ); TRACE_EVENT(sched_load_balance_nohz_kick, Loading Loading @@ -1213,7 +1215,7 @@ TRACE_EVENT(sched_compute_energy, TRACE_EVENT(sched_task_util, TP_PROTO(struct task_struct *p, unsigned long candidates, int best_energy_cpu, bool sync, bool need_idle, int fastpath, int best_energy_cpu, bool sync, int need_idle, int fastpath, bool placement_boost, u64 start_t, bool stune_boosted, bool is_rtg, bool rtg_skip_min, int start_cpu), Loading @@ -1230,7 +1232,7 @@ TRACE_EVENT(sched_task_util, __field(int, prev_cpu) __field(int, best_energy_cpu) __field(bool, sync) __field(bool, need_idle) __field(int, need_idle) __field(int, fastpath) __field(int, placement_boost) __field(int, rtg_cpu) Loading kernel/sched/fair.c +154 −44 Original line number Diff line number Diff line Loading @@ -176,6 +176,7 @@ unsigned int sysctl_sched_min_task_util_for_boost = 51; /* 0.68ms default for 20ms window size scaled to 1024 */ unsigned int sysctl_sched_min_task_util_for_colocation = 35; unsigned int sched_task_filter_util = 35; __read_mostly unsigned int sysctl_sched_prefer_spread; #endif unsigned int sched_small_task_threshold = 102; Loading Loading @@ -3928,16 +3929,27 @@ static inline bool task_demand_fits(struct task_struct *p, int cpu) } struct find_best_target_env { bool is_rtg; int placement_boost; bool need_idle; bool boosted; int need_idle; int fastpath; int start_cpu; bool strict_max; int skip_cpu; bool is_rtg; bool boosted; bool strict_max; }; static inline bool prefer_spread_on_idle(int cpu) { if (likely(!sysctl_sched_prefer_spread)) return false; if (is_min_capacity_cpu(cpu)) return sysctl_sched_prefer_spread >= 1; return sysctl_sched_prefer_spread > 1; } static inline void adjust_cpus_for_packing(struct task_struct *p, int *target_cpu, int *best_idle_cpu, int shallowest_idle_cstate, Loading @@ -3949,7 +3961,10 @@ static inline void adjust_cpus_for_packing(struct task_struct *p, if (*best_idle_cpu == -1 || *target_cpu == -1) return; if (task_placement_boost_enabled(p) || fbt_env->need_idle || boosted || if (prefer_spread_on_idle(*best_idle_cpu)) fbt_env->need_idle |= 2; if (fbt_env->need_idle || task_placement_boost_enabled(p) || boosted || shallowest_idle_cstate <= 0) { *target_cpu = -1; return; Loading Loading @@ -7582,6 +7597,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, curr_is_rtg = task_in_related_thread_group(cpu_rq(cpu)->curr); fbt_env.fastpath = 0; fbt_env.need_idle = need_idle; if (trace_sched_task_util_enabled()) start_t = sched_clock(); Loading Loading @@ -7629,7 +7645,6 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, if (sched_feat(FIND_BEST_TARGET)) { fbt_env.is_rtg = is_rtg; fbt_env.placement_boost = placement_boost; fbt_env.need_idle = need_idle; fbt_env.start_cpu = start_cpu; fbt_env.boosted = boosted; fbt_env.strict_max = is_rtg && Loading Loading @@ -7659,7 +7674,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, if (p->state == TASK_WAKING) delta = task_util(p); #endif if (task_placement_boost_enabled(p) || need_idle || boosted || if (task_placement_boost_enabled(p) || fbt_env.need_idle || boosted || is_rtg || __cpu_overutilized(prev_cpu, delta) || !task_fits_max(p, prev_cpu) || cpu_isolated(prev_cpu)) { best_energy_cpu = cpu; Loading Loading @@ -7703,8 +7718,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, done: trace_sched_task_util(p, cpumask_bits(candidates)[0], best_energy_cpu, sync, need_idle, fbt_env.fastpath, placement_boost, start_t, boosted, is_rtg, get_rtg_status(p), start_cpu); sync, fbt_env.need_idle, fbt_env.fastpath, placement_boost, start_t, boosted, is_rtg, get_rtg_status(p), start_cpu); return best_energy_cpu; Loading Loading @@ -8410,6 +8426,7 @@ struct lb_env { unsigned int loop; unsigned int loop_break; unsigned int loop_max; bool prefer_spread; enum fbq_type fbq_type; enum group_type src_grp_type; Loading Loading @@ -8582,7 +8599,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) struct root_domain *rd = env->dst_rq->rd; if ((rcu_dereference(rd->pd) && !sd_overutilized(env->sd)) && env->idle == CPU_NEWLY_IDLE && env->idle == CPU_NEWLY_IDLE && !env->prefer_spread && !task_in_related_thread_group(p)) { long util_cum_dst, util_cum_src; unsigned long demand; Loading Loading @@ -8754,8 +8771,13 @@ static int detach_tasks(struct lb_env *env) * So only when there is other tasks can be balanced or * there is situation to ignore big task, it is needed * to skip the task load bigger than 2*imbalance. * * And load based checks are skipped for prefer_spread in * finding busiest group, ignore the task's h_load. */ if (((cpu_rq(env->src_cpu)->nr_running > 2) || if (!env->prefer_spread && ((cpu_rq(env->src_cpu)->nr_running > 2) || (env->flags & LBF_IGNORE_BIG_TASKS)) && ((load / 2) > env->imbalance)) goto next; Loading Loading @@ -9549,6 +9571,11 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (sgs->group_type < busiest->group_type) return false; if (env->prefer_spread && env->idle != CPU_NOT_IDLE && (sgs->sum_nr_running > busiest->sum_nr_running) && (sgs->group_util > busiest->group_util)) return true; if (sgs->avg_load <= busiest->avg_load) return false; Loading Loading @@ -9582,6 +9609,11 @@ static bool update_sd_pick_busiest(struct lb_env *env, return false; asym_packing: if (env->prefer_spread && (sgs->sum_nr_running < busiest->sum_nr_running)) return false; /* This is the busiest node in its class. */ if (!(env->sd->flags & SD_ASYM_PACKING)) return true; Loading Loading @@ -10052,6 +10084,15 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s return fix_small_imbalance(env, sds); } /* * If we couldn't find any imbalance, then boost the imbalance * with the group util. */ if (env->prefer_spread && !env->imbalance && env->idle != CPU_NOT_IDLE && busiest->sum_nr_running > busiest->group_weight) env->imbalance = busiest->group_util; } /******* find_busiest_group() helpers end here *********************/ Loading Loading @@ -10439,6 +10480,11 @@ static int load_balance(int this_cpu, struct rq *this_rq, .loop = 0, }; env.prefer_spread = (prefer_spread_on_idle(this_cpu) && !((sd->flags & SD_ASYM_CPUCAPACITY) && !cpumask_test_cpu(this_cpu, &asym_cap_sibling_cpus))); cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask); schedstat_inc(sd->lb_count[idle]); Loading Loading @@ -10728,7 +10774,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, busiest ? busiest->nr_running : 0, env.imbalance, env.flags, ld_moved, sd->balance_interval, active_balance, sd_overutilized(sd)); sd_overutilized(sd), env.prefer_spread); return ld_moved; } Loading Loading @@ -10961,7 +11007,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) } max_cost += sd->max_newidle_lb_cost; if (!sd_overutilized(sd)) if (!sd_overutilized(sd) && !prefer_spread_on_idle(cpu)) continue; if (!(sd->flags & SD_LOAD_BALANCE)) Loading Loading @@ -11045,6 +11091,71 @@ static inline int on_null_domain(struct rq *rq) } #ifdef CONFIG_NO_HZ_COMMON static inline int find_energy_aware_new_ilb(void) { int ilb = nr_cpu_ids; struct sched_domain *sd; int cpu = raw_smp_processor_id(); cpumask_t idle_cpus, tmp_cpus; struct sched_group *sg; unsigned long ref_cap = capacity_orig_of(cpu); unsigned long best_cap = 0, best_cap_cpu = -1; rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu)); if (!sd) goto out; cpumask_and(&idle_cpus, nohz.idle_cpus_mask, housekeeping_cpumask(HK_FLAG_MISC)); cpumask_andnot(&idle_cpus, &idle_cpus, cpu_isolated_mask); sg = sd->groups; do { int i; unsigned long cap; cpumask_and(&tmp_cpus, &idle_cpus, sched_group_span(sg)); i = cpumask_first(&tmp_cpus); /* This sg did not have any idle CPUs */ if (i >= nr_cpu_ids) continue; cap = capacity_orig_of(i); /* The first preference is for the same capacity CPU */ if (cap == ref_cap) { ilb = i; break; } /* * When there are no idle CPUs in the same capacity group, * we find the next best capacity CPU. */ if (best_cap > ref_cap) { if (cap > ref_cap && cap < best_cap) { best_cap = cap; best_cap_cpu = i; } continue; } if (cap > best_cap) { best_cap = cap; best_cap_cpu = i; } } while (sg = sg->next, sg != sd->groups); if (best_cap_cpu != -1) ilb = best_cap_cpu; out: rcu_read_unlock(); return ilb; } /* * idle load balancing details * - When one of the busy CPUs notice that there may be an idle rebalancing Loading @@ -11056,33 +11167,10 @@ static inline int on_null_domain(struct rq *rq) static inline int find_new_ilb(void) { int ilb = nr_cpu_ids; struct sched_domain *sd; int cpu = raw_smp_processor_id(); struct rq *rq = cpu_rq(cpu); cpumask_t cpumask; rcu_read_lock(); sd = rcu_dereference_check_sched_domain(rq->sd); if (sd) { cpumask_and(&cpumask, nohz.idle_cpus_mask, sched_domain_span(sd)); cpumask_andnot(&cpumask, &cpumask, cpu_isolated_mask); ilb = cpumask_first(&cpumask); } rcu_read_unlock(); int ilb; if (sd && (ilb >= nr_cpu_ids || !idle_cpu(ilb))) { if (!static_branch_unlikely(&sched_energy_present) || (capacity_orig_of(cpu) == cpu_rq(cpu)->rd->max_cpu_capacity.val || cpu_overutilized(cpu))) { cpumask_andnot(&cpumask, nohz.idle_cpus_mask, cpu_isolated_mask); ilb = cpumask_first(&cpumask); } } if (static_branch_likely(&sched_energy_present)) return find_energy_aware_new_ilb(); for_each_cpu_and(ilb, nohz.idle_cpus_mask, housekeeping_cpumask(HK_FLAG_MISC)) { Loading Loading @@ -11166,7 +11254,19 @@ static void nohz_balancer_kick(struct rq *rq) if (time_before(now, nohz.next_balance)) goto out; if (rq->nr_running >= 2 || rq->misfit_task_load) { /* * With EAS, no-hz idle balance is allowed only when the CPU * is overutilized and has 2 tasks. The misfit task migration * happens from the tickpath. */ if (static_branch_likely(&sched_energy_present)) { if (rq->nr_running >= 2 && (cpu_overutilized(cpu) || prefer_spread_on_idle(cpu))) flags = NOHZ_KICK_MASK; goto out; } if (rq->nr_running >= 2) { flags = NOHZ_KICK_MASK; goto out; } Loading Loading @@ -11531,6 +11631,11 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf) int pulled_task = 0; u64 curr_cost = 0; u64 avg_idle = this_rq->avg_idle; bool prefer_spread = prefer_spread_on_idle(this_cpu); bool force_lb = (!is_min_capacity_cpu(this_cpu) && silver_has_big_tasks() && (atomic_read(&this_rq->nr_iowait) == 0)); if (cpu_isolated(this_cpu)) return 0; Loading @@ -11546,8 +11651,8 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf) */ if (!cpu_active(this_cpu)) return 0; if (!is_min_capacity_cpu(this_cpu) && silver_has_big_tasks() && (atomic_read(&this_rq->nr_iowait) == 0)) if (force_lb || prefer_spread) avg_idle = ULLONG_MAX; /* * This is OK, because current is on_cpu, which avoids it being picked Loading Loading @@ -11582,6 +11687,11 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf) if (!(sd->flags & SD_LOAD_BALANCE)) continue; if (prefer_spread && !force_lb && (sd->flags & SD_ASYM_CPUCAPACITY) && !(cpumask_test_cpu(this_cpu, &asym_cap_sibling_cpus))) avg_idle = this_rq->avg_idle; if (avg_idle < curr_cost + sd->max_newidle_lb_cost) { update_next_balance(sd, &next_balance); break; Loading kernel/sysctl.c +9 −0 Original line number Diff line number Diff line Loading @@ -551,6 +551,15 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sched_updown_migrate_handler, }, { .procname = "sched_prefer_spread", .data = &sysctl_sched_prefer_spread, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &two, }, #endif #ifdef CONFIG_SCHED_DEBUG { Loading Loading
include/linux/sched/sysctl.h +1 −0 Original line number Diff line number Diff line Loading @@ -56,6 +56,7 @@ extern unsigned int sysctl_sched_coloc_busy_hyst_max_ms; extern unsigned int sysctl_sched_window_stats_policy; extern unsigned int sysctl_sched_ravg_window_nr_ticks; extern unsigned int sysctl_sched_dynamic_ravg_window_enable; extern unsigned int sysctl_sched_prefer_spread; extern int walt_proc_group_thresholds_handler(struct ctl_table *table, int write, Loading
include/trace/events/sched.h +8 −6 Original line number Diff line number Diff line Loading @@ -278,11 +278,11 @@ TRACE_EVENT(sched_load_balance, unsigned long group_mask, int busiest_nr_running, unsigned long imbalance, unsigned int env_flags, int ld_moved, unsigned int balance_interval, int active_balance, int overutilized), int overutilized, int prefer_spread), TP_ARGS(cpu, idle, balance, group_mask, busiest_nr_running, imbalance, env_flags, ld_moved, balance_interval, active_balance, overutilized), active_balance, overutilized, prefer_spread), TP_STRUCT__entry( __field(int, cpu) Loading @@ -296,6 +296,7 @@ TRACE_EVENT(sched_load_balance, __field(unsigned int, balance_interval) __field(int, active_balance) __field(int, overutilized) __field(int, prefer_spread) ), TP_fast_assign( Loading @@ -310,9 +311,10 @@ TRACE_EVENT(sched_load_balance, __entry->balance_interval = balance_interval; __entry->active_balance = active_balance; __entry->overutilized = overutilized; __entry->prefer_spread = prefer_spread; ), TP_printk("cpu=%d state=%s balance=%d group=%#lx busy_nr=%d imbalance=%ld flags=%#x ld_moved=%d bal_int=%d active_balance=%d sd_overutilized=%d", TP_printk("cpu=%d state=%s balance=%d group=%#lx busy_nr=%d imbalance=%ld flags=%#x ld_moved=%d bal_int=%d active_balance=%d sd_overutilized=%d prefer_spread=%d", __entry->cpu, __entry->idle == CPU_IDLE ? "idle" : (__entry->idle == CPU_NEWLY_IDLE ? "newly_idle" : "busy"), Loading @@ -320,7 +322,7 @@ TRACE_EVENT(sched_load_balance, __entry->group_mask, __entry->busiest_nr_running, __entry->imbalance, __entry->env_flags, __entry->ld_moved, __entry->balance_interval, __entry->active_balance, __entry->overutilized) __entry->overutilized, __entry->prefer_spread) ); TRACE_EVENT(sched_load_balance_nohz_kick, Loading Loading @@ -1213,7 +1215,7 @@ TRACE_EVENT(sched_compute_energy, TRACE_EVENT(sched_task_util, TP_PROTO(struct task_struct *p, unsigned long candidates, int best_energy_cpu, bool sync, bool need_idle, int fastpath, int best_energy_cpu, bool sync, int need_idle, int fastpath, bool placement_boost, u64 start_t, bool stune_boosted, bool is_rtg, bool rtg_skip_min, int start_cpu), Loading @@ -1230,7 +1232,7 @@ TRACE_EVENT(sched_task_util, __field(int, prev_cpu) __field(int, best_energy_cpu) __field(bool, sync) __field(bool, need_idle) __field(int, need_idle) __field(int, fastpath) __field(int, placement_boost) __field(int, rtg_cpu) Loading
kernel/sched/fair.c +154 −44 Original line number Diff line number Diff line Loading @@ -176,6 +176,7 @@ unsigned int sysctl_sched_min_task_util_for_boost = 51; /* 0.68ms default for 20ms window size scaled to 1024 */ unsigned int sysctl_sched_min_task_util_for_colocation = 35; unsigned int sched_task_filter_util = 35; __read_mostly unsigned int sysctl_sched_prefer_spread; #endif unsigned int sched_small_task_threshold = 102; Loading Loading @@ -3928,16 +3929,27 @@ static inline bool task_demand_fits(struct task_struct *p, int cpu) } struct find_best_target_env { bool is_rtg; int placement_boost; bool need_idle; bool boosted; int need_idle; int fastpath; int start_cpu; bool strict_max; int skip_cpu; bool is_rtg; bool boosted; bool strict_max; }; static inline bool prefer_spread_on_idle(int cpu) { if (likely(!sysctl_sched_prefer_spread)) return false; if (is_min_capacity_cpu(cpu)) return sysctl_sched_prefer_spread >= 1; return sysctl_sched_prefer_spread > 1; } static inline void adjust_cpus_for_packing(struct task_struct *p, int *target_cpu, int *best_idle_cpu, int shallowest_idle_cstate, Loading @@ -3949,7 +3961,10 @@ static inline void adjust_cpus_for_packing(struct task_struct *p, if (*best_idle_cpu == -1 || *target_cpu == -1) return; if (task_placement_boost_enabled(p) || fbt_env->need_idle || boosted || if (prefer_spread_on_idle(*best_idle_cpu)) fbt_env->need_idle |= 2; if (fbt_env->need_idle || task_placement_boost_enabled(p) || boosted || shallowest_idle_cstate <= 0) { *target_cpu = -1; return; Loading Loading @@ -7582,6 +7597,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, curr_is_rtg = task_in_related_thread_group(cpu_rq(cpu)->curr); fbt_env.fastpath = 0; fbt_env.need_idle = need_idle; if (trace_sched_task_util_enabled()) start_t = sched_clock(); Loading Loading @@ -7629,7 +7645,6 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, if (sched_feat(FIND_BEST_TARGET)) { fbt_env.is_rtg = is_rtg; fbt_env.placement_boost = placement_boost; fbt_env.need_idle = need_idle; fbt_env.start_cpu = start_cpu; fbt_env.boosted = boosted; fbt_env.strict_max = is_rtg && Loading Loading @@ -7659,7 +7674,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, if (p->state == TASK_WAKING) delta = task_util(p); #endif if (task_placement_boost_enabled(p) || need_idle || boosted || if (task_placement_boost_enabled(p) || fbt_env.need_idle || boosted || is_rtg || __cpu_overutilized(prev_cpu, delta) || !task_fits_max(p, prev_cpu) || cpu_isolated(prev_cpu)) { best_energy_cpu = cpu; Loading Loading @@ -7703,8 +7718,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, done: trace_sched_task_util(p, cpumask_bits(candidates)[0], best_energy_cpu, sync, need_idle, fbt_env.fastpath, placement_boost, start_t, boosted, is_rtg, get_rtg_status(p), start_cpu); sync, fbt_env.need_idle, fbt_env.fastpath, placement_boost, start_t, boosted, is_rtg, get_rtg_status(p), start_cpu); return best_energy_cpu; Loading Loading @@ -8410,6 +8426,7 @@ struct lb_env { unsigned int loop; unsigned int loop_break; unsigned int loop_max; bool prefer_spread; enum fbq_type fbq_type; enum group_type src_grp_type; Loading Loading @@ -8582,7 +8599,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) struct root_domain *rd = env->dst_rq->rd; if ((rcu_dereference(rd->pd) && !sd_overutilized(env->sd)) && env->idle == CPU_NEWLY_IDLE && env->idle == CPU_NEWLY_IDLE && !env->prefer_spread && !task_in_related_thread_group(p)) { long util_cum_dst, util_cum_src; unsigned long demand; Loading Loading @@ -8754,8 +8771,13 @@ static int detach_tasks(struct lb_env *env) * So only when there is other tasks can be balanced or * there is situation to ignore big task, it is needed * to skip the task load bigger than 2*imbalance. * * And load based checks are skipped for prefer_spread in * finding busiest group, ignore the task's h_load. */ if (((cpu_rq(env->src_cpu)->nr_running > 2) || if (!env->prefer_spread && ((cpu_rq(env->src_cpu)->nr_running > 2) || (env->flags & LBF_IGNORE_BIG_TASKS)) && ((load / 2) > env->imbalance)) goto next; Loading Loading @@ -9549,6 +9571,11 @@ static bool update_sd_pick_busiest(struct lb_env *env, if (sgs->group_type < busiest->group_type) return false; if (env->prefer_spread && env->idle != CPU_NOT_IDLE && (sgs->sum_nr_running > busiest->sum_nr_running) && (sgs->group_util > busiest->group_util)) return true; if (sgs->avg_load <= busiest->avg_load) return false; Loading Loading @@ -9582,6 +9609,11 @@ static bool update_sd_pick_busiest(struct lb_env *env, return false; asym_packing: if (env->prefer_spread && (sgs->sum_nr_running < busiest->sum_nr_running)) return false; /* This is the busiest node in its class. */ if (!(env->sd->flags & SD_ASYM_PACKING)) return true; Loading Loading @@ -10052,6 +10084,15 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s return fix_small_imbalance(env, sds); } /* * If we couldn't find any imbalance, then boost the imbalance * with the group util. */ if (env->prefer_spread && !env->imbalance && env->idle != CPU_NOT_IDLE && busiest->sum_nr_running > busiest->group_weight) env->imbalance = busiest->group_util; } /******* find_busiest_group() helpers end here *********************/ Loading Loading @@ -10439,6 +10480,11 @@ static int load_balance(int this_cpu, struct rq *this_rq, .loop = 0, }; env.prefer_spread = (prefer_spread_on_idle(this_cpu) && !((sd->flags & SD_ASYM_CPUCAPACITY) && !cpumask_test_cpu(this_cpu, &asym_cap_sibling_cpus))); cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask); schedstat_inc(sd->lb_count[idle]); Loading Loading @@ -10728,7 +10774,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, busiest ? busiest->nr_running : 0, env.imbalance, env.flags, ld_moved, sd->balance_interval, active_balance, sd_overutilized(sd)); sd_overutilized(sd), env.prefer_spread); return ld_moved; } Loading Loading @@ -10961,7 +11007,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) } max_cost += sd->max_newidle_lb_cost; if (!sd_overutilized(sd)) if (!sd_overutilized(sd) && !prefer_spread_on_idle(cpu)) continue; if (!(sd->flags & SD_LOAD_BALANCE)) Loading Loading @@ -11045,6 +11091,71 @@ static inline int on_null_domain(struct rq *rq) } #ifdef CONFIG_NO_HZ_COMMON static inline int find_energy_aware_new_ilb(void) { int ilb = nr_cpu_ids; struct sched_domain *sd; int cpu = raw_smp_processor_id(); cpumask_t idle_cpus, tmp_cpus; struct sched_group *sg; unsigned long ref_cap = capacity_orig_of(cpu); unsigned long best_cap = 0, best_cap_cpu = -1; rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu)); if (!sd) goto out; cpumask_and(&idle_cpus, nohz.idle_cpus_mask, housekeeping_cpumask(HK_FLAG_MISC)); cpumask_andnot(&idle_cpus, &idle_cpus, cpu_isolated_mask); sg = sd->groups; do { int i; unsigned long cap; cpumask_and(&tmp_cpus, &idle_cpus, sched_group_span(sg)); i = cpumask_first(&tmp_cpus); /* This sg did not have any idle CPUs */ if (i >= nr_cpu_ids) continue; cap = capacity_orig_of(i); /* The first preference is for the same capacity CPU */ if (cap == ref_cap) { ilb = i; break; } /* * When there are no idle CPUs in the same capacity group, * we find the next best capacity CPU. */ if (best_cap > ref_cap) { if (cap > ref_cap && cap < best_cap) { best_cap = cap; best_cap_cpu = i; } continue; } if (cap > best_cap) { best_cap = cap; best_cap_cpu = i; } } while (sg = sg->next, sg != sd->groups); if (best_cap_cpu != -1) ilb = best_cap_cpu; out: rcu_read_unlock(); return ilb; } /* * idle load balancing details * - When one of the busy CPUs notice that there may be an idle rebalancing Loading @@ -11056,33 +11167,10 @@ static inline int on_null_domain(struct rq *rq) static inline int find_new_ilb(void) { int ilb = nr_cpu_ids; struct sched_domain *sd; int cpu = raw_smp_processor_id(); struct rq *rq = cpu_rq(cpu); cpumask_t cpumask; rcu_read_lock(); sd = rcu_dereference_check_sched_domain(rq->sd); if (sd) { cpumask_and(&cpumask, nohz.idle_cpus_mask, sched_domain_span(sd)); cpumask_andnot(&cpumask, &cpumask, cpu_isolated_mask); ilb = cpumask_first(&cpumask); } rcu_read_unlock(); int ilb; if (sd && (ilb >= nr_cpu_ids || !idle_cpu(ilb))) { if (!static_branch_unlikely(&sched_energy_present) || (capacity_orig_of(cpu) == cpu_rq(cpu)->rd->max_cpu_capacity.val || cpu_overutilized(cpu))) { cpumask_andnot(&cpumask, nohz.idle_cpus_mask, cpu_isolated_mask); ilb = cpumask_first(&cpumask); } } if (static_branch_likely(&sched_energy_present)) return find_energy_aware_new_ilb(); for_each_cpu_and(ilb, nohz.idle_cpus_mask, housekeeping_cpumask(HK_FLAG_MISC)) { Loading Loading @@ -11166,7 +11254,19 @@ static void nohz_balancer_kick(struct rq *rq) if (time_before(now, nohz.next_balance)) goto out; if (rq->nr_running >= 2 || rq->misfit_task_load) { /* * With EAS, no-hz idle balance is allowed only when the CPU * is overutilized and has 2 tasks. The misfit task migration * happens from the tickpath. */ if (static_branch_likely(&sched_energy_present)) { if (rq->nr_running >= 2 && (cpu_overutilized(cpu) || prefer_spread_on_idle(cpu))) flags = NOHZ_KICK_MASK; goto out; } if (rq->nr_running >= 2) { flags = NOHZ_KICK_MASK; goto out; } Loading Loading @@ -11531,6 +11631,11 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf) int pulled_task = 0; u64 curr_cost = 0; u64 avg_idle = this_rq->avg_idle; bool prefer_spread = prefer_spread_on_idle(this_cpu); bool force_lb = (!is_min_capacity_cpu(this_cpu) && silver_has_big_tasks() && (atomic_read(&this_rq->nr_iowait) == 0)); if (cpu_isolated(this_cpu)) return 0; Loading @@ -11546,8 +11651,8 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf) */ if (!cpu_active(this_cpu)) return 0; if (!is_min_capacity_cpu(this_cpu) && silver_has_big_tasks() && (atomic_read(&this_rq->nr_iowait) == 0)) if (force_lb || prefer_spread) avg_idle = ULLONG_MAX; /* * This is OK, because current is on_cpu, which avoids it being picked Loading Loading @@ -11582,6 +11687,11 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf) if (!(sd->flags & SD_LOAD_BALANCE)) continue; if (prefer_spread && !force_lb && (sd->flags & SD_ASYM_CPUCAPACITY) && !(cpumask_test_cpu(this_cpu, &asym_cap_sibling_cpus))) avg_idle = this_rq->avg_idle; if (avg_idle < curr_cost + sd->max_newidle_lb_cost) { update_next_balance(sd, &next_balance); break; Loading
kernel/sysctl.c +9 −0 Original line number Diff line number Diff line Loading @@ -551,6 +551,15 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sched_updown_migrate_handler, }, { .procname = "sched_prefer_spread", .data = &sysctl_sched_prefer_spread, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &two, }, #endif #ifdef CONFIG_SCHED_DEBUG { Loading