Loading kernel/sched/fair.c +73 −21 Original line number Diff line number Diff line Loading @@ -9332,6 +9332,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) cpu_rq(cpu)->cpu_capacity = capacity; sdg->sgc->capacity = capacity; sdg->sgc->min_capacity = capacity; sdg->sgc->max_capacity = capacity; } void update_group_capacity(struct sched_domain *sd, int cpu) Loading Loading @@ -9507,17 +9508,29 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) } /* * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller * per-CPU capacity than sched_group ref. */ static inline bool group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref) group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref) { return sg->sgc->min_capacity * sched_capacity_margin_up[group_first_cpu(sg)] < ref->sgc->min_capacity * 1024; } /* * group_smaller_max_cpu_capacity: Returns true if sched_group sg has smaller * per-CPU capacity_orig than sched_group ref. */ static inline bool group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref) { return sg->sgc->max_capacity * sched_capacity_margin_up[group_first_cpu(sg)] < ref->sgc->max_capacity * 1024; } /* * group_similar_cpu_capacity: Returns true if the minimum capacity of the * compared groups differ by less than 12.5%. Loading Loading @@ -9554,7 +9567,7 @@ group_type group_classify(struct sched_group *group, * @load_idx: Load index of sched_domain of this_cpu for load calc. * @local_group: Does group contain this_cpu. * @sgs: variable to hold the statistics for this group. * @overload: Indicate more than one runnable task for any CPU. * @overload: Indicate pullable load (e.g. >1 runnable task). * @overutilized: Indicate overutilization for any CPU. */ static inline void update_sg_lb_stats(struct lb_env *env, Loading Loading @@ -9599,8 +9612,11 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->idle_cpus++; if (env->sd->flags & SD_ASYM_CPUCAPACITY && sgs->group_misfit_task_load < rq->misfit_task_load) sgs->group_misfit_task_load < rq->misfit_task_load) { sgs->group_misfit_task_load = rq->misfit_task_load; *overload = 1; } if (cpu_overutilized(i)) { *overutilized = true; Loading Loading @@ -9656,9 +9672,12 @@ static bool update_sd_pick_busiest(struct lb_env *env, /* * Don't try to pull misfit tasks we can't help. * We can use max_capacity here as reduction in capacity on some * cpus in the group should either be possible to resolve * internally or be covered by avg_load imbalance (eventually). */ if (sgs->group_type == group_misfit_task && (!group_smaller_cpu_capacity(sg, sds->local) || (!group_smaller_max_cpu_capacity(sg, sds->local) || !group_has_capacity(env, &sds->local_stat))) return false; Loading @@ -9681,7 +9700,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, * power/energy consequences are not considered. */ if (sgs->sum_nr_running <= sgs->group_weight && group_smaller_cpu_capacity(sds->local, sg)) group_smaller_min_cpu_capacity(sds->local, sg)) return false; /* Loading @@ -9693,6 +9712,13 @@ static bool update_sd_pick_busiest(struct lb_env *env, !group_similar_cpu_capacity(sds->local, sg)) return false; /* * If we have more than one misfit sg go with the biggest misfit. */ if (sgs->group_type == group_misfit_task && sgs->group_misfit_task_load < busiest->group_misfit_task_load) return false; asym_packing: /* This is the busiest node in its class. */ if (!(env->sd->flags & SD_ASYM_PACKING)) Loading Loading @@ -9773,11 +9799,9 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; int load_idx, prefer_sibling = 0; int load_idx; bool overload = false, overutilized = false, misfit_task = false; if (child && child->flags & SD_PREFER_SIBLING) prefer_sibling = 1; bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING; #ifdef CONFIG_NO_HZ_COMMON if (env->idle == CPU_NEWLY_IDLE) { Loading Loading @@ -9865,8 +9889,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd if (!lb_sd_parent(env->sd)) { /* update overload indicator if we are at root domain */ if (env->dst_rq->rd->overload != overload) env->dst_rq->rd->overload = overload; if (READ_ONCE(env->dst_rq->rd->overload) != overload) WRITE_ONCE(env->dst_rq->rd->overload, overload); } if (overutilized) Loading Loading @@ -10112,8 +10136,18 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s (sds->avg_load - local->avg_load) * local->group_capacity ) / SCHED_CAPACITY_SCALE; /* Boost imbalance to allow misfit task to be balanced. */ if (busiest->group_type == group_misfit_task) { /* Boost imbalance to allow misfit task to be balanced. * Always do this if we are doing a NEWLY_IDLE balance * on the assumption that any tasks we have must not be * long-running (and hence we cannot rely upon load). * However if we are not idle, we should assume the tasks * we have are longer running and not override load-based * calculations above unless we are sure that the local * group is underutilized. */ if (busiest->group_type == group_misfit_task && (env->idle == CPU_NEWLY_IDLE || local->sum_nr_running < local->group_weight)) { env->imbalance = max_t(long, env->imbalance, busiest->group_misfit_task_load); } Loading Loading @@ -10214,7 +10248,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) busiest->group_no_capacity) goto force_balance; /* Misfitting tasks should be dealt with regardless of the avg load */ /* Misfit tasks should be dealt with regardless of the avg load */ if (busiest->group_type == group_misfit_task) goto force_balance; Loading Loading @@ -10304,15 +10338,30 @@ static struct rq *find_busiest_queue(struct lb_env *env, continue; /* * For ASYM_CPUCAPACITY domains with misfit tasks we ignore * load. * For ASYM_CPUCAPACITY domains with misfit tasks we simply * seek the "biggest" misfit task. */ if (env->src_grp_type == group_misfit_task && rq->misfit_task_load) return rq; if (env->src_grp_type == group_misfit_task) { if (rq->misfit_task_load > busiest_load) { busiest_load = rq->misfit_task_load; busiest = rq; } continue; } capacity = capacity_of(i); /* * For ASYM_CPUCAPACITY domains, don't pick a cpu that could * eventually lead to active_balancing high->low capacity. * Higher per-cpu capacity is considered better than balancing * average load. */ if (env->sd->flags & SD_ASYM_CPUCAPACITY && capacity_of(env->dst_cpu) < capacity && rq->nr_running == 1) continue; wl = weighted_cpuload(rq); /* Loading Loading @@ -10390,6 +10439,9 @@ static int need_active_balance(struct lb_env *env) return 1; } if (env->src_grp_type == group_misfit_task) return 1; return unlikely(sd->nr_balance_failed > sd->cache_nice_tries + NEED_ACTIVE_BALANCE_THRESHOLD); } Loading Loading @@ -10814,7 +10866,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf) rq_unpin_lock(this_rq, rf); if (this_rq->avg_idle < sysctl_sched_migration_cost || !this_rq->rd->overload) { !READ_ONCE(this_rq->rd->overload)) { rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); if (sd) Loading kernel/sched/sched.h +8 −4 Original line number Diff line number Diff line Loading @@ -695,8 +695,12 @@ struct root_domain { cpumask_var_t span; cpumask_var_t online; /* Indicate more than one runnable task for any CPU */ bool overload; /* * Indicate pullable load on at least one CPU, e.g: * - More than one runnable task * - Running task is misfit */ int overload; /* * The bit corresponding to a CPU gets set here if such CPU has more Loading Loading @@ -1761,8 +1765,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count) if (prev_nr < 2 && rq->nr_running >= 2) { #ifdef CONFIG_SMP if (!rq->rd->overload) rq->rd->overload = true; if (!READ_ONCE(rq->rd->overload)) WRITE_ONCE(rq->rd->overload, 1); #endif } Loading kernel/sched/topology.c +29 −4 Original line number Diff line number Diff line Loading @@ -915,6 +915,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg)); sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; sg->sgc->max_capacity = SCHED_CAPACITY_SCALE; return sg; } Loading Loading @@ -1290,7 +1291,7 @@ sd_init(struct sched_domain_topology_level *tl, | 0*SD_SHARE_CPUCAPACITY | 0*SD_SHARE_PKG_RESOURCES | 0*SD_SERIALIZE | 0*SD_PREFER_SIBLING | 1*SD_PREFER_SIBLING | 0*SD_NUMA | sd_flags , Loading @@ -1309,6 +1310,26 @@ sd_init(struct sched_domain_topology_level *tl, cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); sd_id = cpumask_first(sched_domain_span(sd)); /* * Check if cpu_map eclipses cpu capacity asymmetry. */ if (sd->flags & SD_ASYM_CPUCAPACITY) { long capacity = arch_scale_cpu_capacity(NULL, sd_id); bool disable = true; int i; for_each_cpu(i, sched_domain_span(sd)) { if (capacity != arch_scale_cpu_capacity(NULL, i)) { disable = false; break; } } if (disable) sd->flags &= ~SD_ASYM_CPUCAPACITY; } /* * Convert topological properties into behaviour. */ Loading @@ -1316,12 +1337,17 @@ sd_init(struct sched_domain_topology_level *tl, if (sd->flags & SD_ASYM_CPUCAPACITY) { struct sched_domain *t = sd; /* * Don't attempt to spread across cpus of different capacities. */ if (sd->child) sd->child->flags &= ~SD_PREFER_SIBLING; for_each_lower_domain(t) t->flags |= SD_BALANCE_WAKE; } if (sd->flags & SD_SHARE_CPUCAPACITY) { sd->flags |= SD_PREFER_SIBLING; sd->imbalance_pct = 110; sd->smt_gain = 1178; /* ~15% */ Loading @@ -1336,6 +1362,7 @@ sd_init(struct sched_domain_topology_level *tl, sd->busy_idx = 3; sd->idle_idx = 2; sd->flags &= ~SD_PREFER_SIBLING; sd->flags |= SD_SERIALIZE; if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { sd->flags &= ~(SD_BALANCE_EXEC | Loading @@ -1345,7 +1372,6 @@ sd_init(struct sched_domain_topology_level *tl, #endif } else { sd->flags |= SD_PREFER_SIBLING; sd->cache_nice_tries = 1; sd->busy_idx = 2; sd->idle_idx = 1; Loading Loading @@ -2102,4 +2128,3 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], mutex_unlock(&sched_domains_mutex); } Loading
kernel/sched/fair.c +73 −21 Original line number Diff line number Diff line Loading @@ -9332,6 +9332,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) cpu_rq(cpu)->cpu_capacity = capacity; sdg->sgc->capacity = capacity; sdg->sgc->min_capacity = capacity; sdg->sgc->max_capacity = capacity; } void update_group_capacity(struct sched_domain *sd, int cpu) Loading Loading @@ -9507,17 +9508,29 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) } /* * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller * per-CPU capacity than sched_group ref. */ static inline bool group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref) group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref) { return sg->sgc->min_capacity * sched_capacity_margin_up[group_first_cpu(sg)] < ref->sgc->min_capacity * 1024; } /* * group_smaller_max_cpu_capacity: Returns true if sched_group sg has smaller * per-CPU capacity_orig than sched_group ref. */ static inline bool group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref) { return sg->sgc->max_capacity * sched_capacity_margin_up[group_first_cpu(sg)] < ref->sgc->max_capacity * 1024; } /* * group_similar_cpu_capacity: Returns true if the minimum capacity of the * compared groups differ by less than 12.5%. Loading Loading @@ -9554,7 +9567,7 @@ group_type group_classify(struct sched_group *group, * @load_idx: Load index of sched_domain of this_cpu for load calc. * @local_group: Does group contain this_cpu. * @sgs: variable to hold the statistics for this group. * @overload: Indicate more than one runnable task for any CPU. * @overload: Indicate pullable load (e.g. >1 runnable task). * @overutilized: Indicate overutilization for any CPU. */ static inline void update_sg_lb_stats(struct lb_env *env, Loading Loading @@ -9599,8 +9612,11 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->idle_cpus++; if (env->sd->flags & SD_ASYM_CPUCAPACITY && sgs->group_misfit_task_load < rq->misfit_task_load) sgs->group_misfit_task_load < rq->misfit_task_load) { sgs->group_misfit_task_load = rq->misfit_task_load; *overload = 1; } if (cpu_overutilized(i)) { *overutilized = true; Loading Loading @@ -9656,9 +9672,12 @@ static bool update_sd_pick_busiest(struct lb_env *env, /* * Don't try to pull misfit tasks we can't help. * We can use max_capacity here as reduction in capacity on some * cpus in the group should either be possible to resolve * internally or be covered by avg_load imbalance (eventually). */ if (sgs->group_type == group_misfit_task && (!group_smaller_cpu_capacity(sg, sds->local) || (!group_smaller_max_cpu_capacity(sg, sds->local) || !group_has_capacity(env, &sds->local_stat))) return false; Loading @@ -9681,7 +9700,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, * power/energy consequences are not considered. */ if (sgs->sum_nr_running <= sgs->group_weight && group_smaller_cpu_capacity(sds->local, sg)) group_smaller_min_cpu_capacity(sds->local, sg)) return false; /* Loading @@ -9693,6 +9712,13 @@ static bool update_sd_pick_busiest(struct lb_env *env, !group_similar_cpu_capacity(sds->local, sg)) return false; /* * If we have more than one misfit sg go with the biggest misfit. */ if (sgs->group_type == group_misfit_task && sgs->group_misfit_task_load < busiest->group_misfit_task_load) return false; asym_packing: /* This is the busiest node in its class. */ if (!(env->sd->flags & SD_ASYM_PACKING)) Loading Loading @@ -9773,11 +9799,9 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; int load_idx, prefer_sibling = 0; int load_idx; bool overload = false, overutilized = false, misfit_task = false; if (child && child->flags & SD_PREFER_SIBLING) prefer_sibling = 1; bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING; #ifdef CONFIG_NO_HZ_COMMON if (env->idle == CPU_NEWLY_IDLE) { Loading Loading @@ -9865,8 +9889,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd if (!lb_sd_parent(env->sd)) { /* update overload indicator if we are at root domain */ if (env->dst_rq->rd->overload != overload) env->dst_rq->rd->overload = overload; if (READ_ONCE(env->dst_rq->rd->overload) != overload) WRITE_ONCE(env->dst_rq->rd->overload, overload); } if (overutilized) Loading Loading @@ -10112,8 +10136,18 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s (sds->avg_load - local->avg_load) * local->group_capacity ) / SCHED_CAPACITY_SCALE; /* Boost imbalance to allow misfit task to be balanced. */ if (busiest->group_type == group_misfit_task) { /* Boost imbalance to allow misfit task to be balanced. * Always do this if we are doing a NEWLY_IDLE balance * on the assumption that any tasks we have must not be * long-running (and hence we cannot rely upon load). * However if we are not idle, we should assume the tasks * we have are longer running and not override load-based * calculations above unless we are sure that the local * group is underutilized. */ if (busiest->group_type == group_misfit_task && (env->idle == CPU_NEWLY_IDLE || local->sum_nr_running < local->group_weight)) { env->imbalance = max_t(long, env->imbalance, busiest->group_misfit_task_load); } Loading Loading @@ -10214,7 +10248,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) busiest->group_no_capacity) goto force_balance; /* Misfitting tasks should be dealt with regardless of the avg load */ /* Misfit tasks should be dealt with regardless of the avg load */ if (busiest->group_type == group_misfit_task) goto force_balance; Loading Loading @@ -10304,15 +10338,30 @@ static struct rq *find_busiest_queue(struct lb_env *env, continue; /* * For ASYM_CPUCAPACITY domains with misfit tasks we ignore * load. * For ASYM_CPUCAPACITY domains with misfit tasks we simply * seek the "biggest" misfit task. */ if (env->src_grp_type == group_misfit_task && rq->misfit_task_load) return rq; if (env->src_grp_type == group_misfit_task) { if (rq->misfit_task_load > busiest_load) { busiest_load = rq->misfit_task_load; busiest = rq; } continue; } capacity = capacity_of(i); /* * For ASYM_CPUCAPACITY domains, don't pick a cpu that could * eventually lead to active_balancing high->low capacity. * Higher per-cpu capacity is considered better than balancing * average load. */ if (env->sd->flags & SD_ASYM_CPUCAPACITY && capacity_of(env->dst_cpu) < capacity && rq->nr_running == 1) continue; wl = weighted_cpuload(rq); /* Loading Loading @@ -10390,6 +10439,9 @@ static int need_active_balance(struct lb_env *env) return 1; } if (env->src_grp_type == group_misfit_task) return 1; return unlikely(sd->nr_balance_failed > sd->cache_nice_tries + NEED_ACTIVE_BALANCE_THRESHOLD); } Loading Loading @@ -10814,7 +10866,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf) rq_unpin_lock(this_rq, rf); if (this_rq->avg_idle < sysctl_sched_migration_cost || !this_rq->rd->overload) { !READ_ONCE(this_rq->rd->overload)) { rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); if (sd) Loading
kernel/sched/sched.h +8 −4 Original line number Diff line number Diff line Loading @@ -695,8 +695,12 @@ struct root_domain { cpumask_var_t span; cpumask_var_t online; /* Indicate more than one runnable task for any CPU */ bool overload; /* * Indicate pullable load on at least one CPU, e.g: * - More than one runnable task * - Running task is misfit */ int overload; /* * The bit corresponding to a CPU gets set here if such CPU has more Loading Loading @@ -1761,8 +1765,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count) if (prev_nr < 2 && rq->nr_running >= 2) { #ifdef CONFIG_SMP if (!rq->rd->overload) rq->rd->overload = true; if (!READ_ONCE(rq->rd->overload)) WRITE_ONCE(rq->rd->overload, 1); #endif } Loading
kernel/sched/topology.c +29 −4 Original line number Diff line number Diff line Loading @@ -915,6 +915,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg)); sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; sg->sgc->max_capacity = SCHED_CAPACITY_SCALE; return sg; } Loading Loading @@ -1290,7 +1291,7 @@ sd_init(struct sched_domain_topology_level *tl, | 0*SD_SHARE_CPUCAPACITY | 0*SD_SHARE_PKG_RESOURCES | 0*SD_SERIALIZE | 0*SD_PREFER_SIBLING | 1*SD_PREFER_SIBLING | 0*SD_NUMA | sd_flags , Loading @@ -1309,6 +1310,26 @@ sd_init(struct sched_domain_topology_level *tl, cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); sd_id = cpumask_first(sched_domain_span(sd)); /* * Check if cpu_map eclipses cpu capacity asymmetry. */ if (sd->flags & SD_ASYM_CPUCAPACITY) { long capacity = arch_scale_cpu_capacity(NULL, sd_id); bool disable = true; int i; for_each_cpu(i, sched_domain_span(sd)) { if (capacity != arch_scale_cpu_capacity(NULL, i)) { disable = false; break; } } if (disable) sd->flags &= ~SD_ASYM_CPUCAPACITY; } /* * Convert topological properties into behaviour. */ Loading @@ -1316,12 +1337,17 @@ sd_init(struct sched_domain_topology_level *tl, if (sd->flags & SD_ASYM_CPUCAPACITY) { struct sched_domain *t = sd; /* * Don't attempt to spread across cpus of different capacities. */ if (sd->child) sd->child->flags &= ~SD_PREFER_SIBLING; for_each_lower_domain(t) t->flags |= SD_BALANCE_WAKE; } if (sd->flags & SD_SHARE_CPUCAPACITY) { sd->flags |= SD_PREFER_SIBLING; sd->imbalance_pct = 110; sd->smt_gain = 1178; /* ~15% */ Loading @@ -1336,6 +1362,7 @@ sd_init(struct sched_domain_topology_level *tl, sd->busy_idx = 3; sd->idle_idx = 2; sd->flags &= ~SD_PREFER_SIBLING; sd->flags |= SD_SERIALIZE; if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { sd->flags &= ~(SD_BALANCE_EXEC | Loading @@ -1345,7 +1372,6 @@ sd_init(struct sched_domain_topology_level *tl, #endif } else { sd->flags |= SD_PREFER_SIBLING; sd->cache_nice_tries = 1; sd->busy_idx = 2; sd->idle_idx = 1; Loading Loading @@ -2102,4 +2128,3 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], mutex_unlock(&sched_domains_mutex); }