Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 92d7dde0 authored by Olav Haugan's avatar Olav Haugan Committed by Satya Durga Srinivasu Prabhala
Browse files

sched: add cpu isolation support



This adds cpu isolation APIs to the scheduler to isolate and unisolate
CPUs. Isolating and unisolating a CPU can be used in place of hotplug.
Isolating and unisolating a CPU is faster than hotplug and can thus be
used to optimize the performance and power of multi-core CPUs.

Isolating works by migrating non-pinned IRQs and tasks to other CPUs and
marking the CPU as not available to the scheduler and load balancer.
Pinned tasks and IRQs are still allowed to run but it is expected that
this would be minimal.

Unisolation works by just marking the CPU available for scheduler and
load balancer.

Change-Id: I710fe8e7541357672f4003e78c1839db40a59f1b
Signed-off-by: default avatarOlav Haugan <ohaugan@codeaurora.org>
[rameezmustafa@codeaurora.org: Port to msm-4.9]
Signed-off-by: default avatarSyed Rameez Mustafa <rameezmustafa@codeaurora.org>
[markivx: Forward port to 4.14, account for refactor, lack of HMP etc.]
Signed-off-by: default avatarVikram Mulukutla <markivx@codeaurora.org>
[satyap@codeaurora.org: Port to 4.19 and fix merge conflicts]
Signed-off-by: default avatarSatya Durga Srinivasu Prabhala <satyap@codeaurora.org>
parent 267538c2
Loading
Loading
Loading
Loading
+35 −0
Original line number Diff line number Diff line
@@ -221,6 +221,41 @@ enum migrate_types {
	RQ_TO_GROUP,
};

#ifdef CONFIG_HOTPLUG_CPU
extern int sched_isolate_count(const cpumask_t *mask, bool include_offline);
extern int sched_isolate_cpu(int cpu);
extern int sched_unisolate_cpu(int cpu);
extern int sched_unisolate_cpu_unlocked(int cpu);
#else
/*
 * Without CONFIG_HOTPLUG_CPU no CPU is ever isolated, so only offline
 * CPUs can contribute to the count, and only when the caller asks for
 * them.
 */
static inline int sched_isolate_count(const cpumask_t *mask,
				      bool include_offline)
{
	cpumask_t count_mask;

	if (!include_offline)
		return 0;

	cpumask_andnot(&count_mask, mask, cpu_online_mask);

	return cpumask_weight(&count_mask);
}

/* CPU isolation is unavailable without CONFIG_HOTPLUG_CPU; no-op success. */
static inline int sched_isolate_cpu(int cpu)
{
	return 0;
}

/* CPU isolation is unavailable without CONFIG_HOTPLUG_CPU; no-op success. */
static inline int sched_unisolate_cpu(int cpu)
{
	return 0;
}

/* CPU isolation is unavailable without CONFIG_HOTPLUG_CPU; no-op success. */
static inline int sched_unisolate_cpu_unlocked(int cpu)
{
	return 0;
}
#endif

extern void scheduler_tick(void);

#define	MAX_SCHEDULE_TIMEOUT		LONG_MAX
+240 −13
Original line number Diff line number Diff line
@@ -10,6 +10,7 @@
#include <linux/nospec.h>

#include <linux/kcov.h>
#include <linux/irq.h>

#include <asm/switch_to.h>
#include <asm/tlb.h>
@@ -1064,6 +1065,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
	struct rq_flags rf;
	struct rq *rq;
	int ret = 0;
	cpumask_t allowed_mask;

	rq = task_rq_lock(p, &rf);
	update_rq_clock(rq);
@@ -1105,7 +1107,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
	}

	/* Can the task run on the task's current CPU? If so, we're done */
	if (cpumask_test_cpu(task_cpu(p), new_mask))
	if (cpumask_test_cpu(task_cpu(p), &allowed_mask))
		goto out;

	dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
@@ -1461,12 +1463,13 @@ EXPORT_SYMBOL_GPL(kick_process);
 * select_task_rq() below may allow selection of !active CPUs in order
 * to satisfy the above rules.
 */
static int select_fallback_rq(int cpu, struct task_struct *p)
static int select_fallback_rq(int cpu, struct task_struct *p, bool allow_iso)
{
	int nid = cpu_to_node(cpu);
	const struct cpumask *nodemask = NULL;
	enum { cpuset, possible, fail } state = cpuset;
	int dest_cpu;
	int isolated_candidate = -1;

	/*
	 * If the node that the CPU is on has been offlined, cpu_to_node()
@@ -1480,6 +1483,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
		for_each_cpu(dest_cpu, nodemask) {
			if (!cpu_active(dest_cpu))
				continue;
			if (cpu_isolated(dest_cpu))
				continue;
			if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
				return dest_cpu;
		}
@@ -1490,7 +1495,16 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
		for_each_cpu(dest_cpu, &p->cpus_allowed) {
			if (!is_cpu_allowed(p, dest_cpu))
				continue;
			if (cpu_isolated(dest_cpu)) {
				if (allow_iso)
					isolated_candidate = dest_cpu;
				continue;
			}
			goto out;
		}

		if (isolated_candidate != -1) {
			dest_cpu = isolated_candidate;
			goto out;
		}

@@ -1537,6 +1551,8 @@ static inline
int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags,
		   int sibling_count_hint)
{
	bool allow_isolated = (p->flags & PF_KTHREAD);

	lockdep_assert_held(&p->pi_lock);

	if (p->nr_cpus_allowed > 1)
@@ -1555,8 +1571,9 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags,
	 * [ this allows ->select_task() to simply return task_cpu(p) and
	 *   not worry about this generic constraint ]
	 */
	if (unlikely(!is_cpu_allowed(p, cpu)))
		cpu = select_fallback_rq(task_cpu(p), p);
	if (unlikely(!is_cpu_allowed(p, cpu)) ||
			(cpu_isolated(cpu) && !allow_isolated))
		cpu = select_fallback_rq(task_cpu(p), p, allow_isolated);

	return cpu;
}
@@ -3030,7 +3047,7 @@ void sched_exec(void)
	if (dest_cpu == smp_processor_id())
		goto unlock;

	if (likely(cpu_active(dest_cpu))) {
	if (likely(cpu_active(dest_cpu) && likely(!cpu_isolated(dest_cpu)))) {
		struct migration_arg arg = { p, dest_cpu };

		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -5681,19 +5698,25 @@ static struct task_struct fake_task = {
};

/*
 * Migrate all tasks from the rq, sleeping tasks will be migrated by
 * try_to_wake_up()->select_task_rq().
 * Migrate all tasks (not pinned if pinned argument says so) from the rq,
 * sleeping tasks will be migrated by try_to_wake_up()->select_task_rq().
 *
 * Called with rq->lock held even though we're in stop_machine() and
 * there's no concurrency possible, we hold the required locks anyway
 * because of lock validation efforts.
 */
static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf,
			  bool migrate_pinned_tasks)
{
	struct rq *rq = dead_rq;
	struct task_struct *next, *stop = rq->stop;
	struct rq_flags orf = *rf;
	struct pin_cookie cookie;
	int dest_cpu;
	unsigned int num_pinned_kthreads = 1; /* this thread */
	cpumask_t avail_cpus;

	cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask);

	/*
	 * Fudge the rq selection such that the below task selection loop
@@ -5715,10 +5738,12 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)

	for (;;) {
		/*
		 * There's this thread running, bail when that's the only
		 * remaining thread:
		 * There's this thread running + pinned threads, bail when
		 * those are the only remaining threads:
		 */
		if (rq->nr_running == 1)
		if ((migrate_pinned_tasks && rq->nr_running == 1) ||
		   (!migrate_pinned_tasks &&
		    rq->nr_running == num_pinned_kthreads))
			break;

		/*
@@ -5728,6 +5753,13 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
		BUG_ON(!next);
		put_prev_task(rq, next);

		if (!migrate_pinned_tasks && next->flags & PF_KTHREAD &&
			!cpumask_intersects(&avail_cpus, &next->cpus_allowed)) {
			lockdep_unpin_lock(&rq->lock, cookie);
			num_pinned_kthreads += 1;
			continue;
		}

		/*
		 * Rules for changing task_struct::cpus_allowed are holding
		 * both pi_lock and rq->lock, such that holding either
@@ -5752,7 +5784,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
		}

		/* Find suitable destination for @next, with force if needed. */
		dest_cpu = select_fallback_rq(dead_rq->cpu, next);
		dest_cpu = select_fallback_rq(dead_rq->cpu, next, false);
		rq = __migrate_task(rq, rf, next, dest_cpu);
		if (rq != dead_rq) {
			rq_unlock(rq, rf);
@@ -5765,6 +5797,201 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)

	rq->stop = stop;
}

void set_rq_online(struct rq *rq);
void set_rq_offline(struct rq *rq);

/*
 * Stop-machine callback run on the CPU being isolated.  Quiesces the CPU:
 * disables its watchdog, migrates all migratable IRQs away, drains pending
 * remote wakeups, detaches the rq from its root domain and migrates every
 * non-pinned task off the runqueue.  Runs from the stopper thread, so no
 * other task can execute on this CPU concurrently.
 */
int do_isolation_work_cpu_stop(void *data)
{
	unsigned long flags;
	unsigned int cpu = smp_processor_id();
	struct rq *rq = cpu_rq(cpu);
	struct rq_flags rf;

	watchdog_disable(cpu);

	/* Move all non-pinned IRQs off this CPU. */
	irq_migrate_all_off_this_cpu();

	/* Process wakeups already queued for this CPU before emptying it. */
	sched_ttwu_pending();
	/* Update our root-domain */
	raw_spin_lock_irqsave(&rq->lock, flags);

	if (rq->rd) {
		BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
		set_rq_offline(rq);
	}

	/*
	 * NOTE(review): rf is passed uninitialized; presumably
	 * migrate_tasks() only uses it to pin/unpin rq->lock — confirm
	 * against migrate_tasks()'s rq_flags expectations.
	 * false: leave tasks pinned to this CPU in place.
	 */
	migrate_tasks(rq, &rf, false);
	raw_spin_unlock_irqrestore(&rq->lock, flags);

	return 0;
}

/*
 * Stop-machine callback run on a CPU leaving isolation: re-arm its
 * watchdog now that the CPU may run work again.
 */
int do_unisolation_work_cpu_stop(void *data)
{
	watchdog_enable(smp_processor_id());
	return 0;
}

/*
 * Recompute sched group capacities in every sched domain containing @cpu,
 * so that load balancing sees the CPU's new isolation state.
 */
static void sched_update_group_capacities(int cpu)
{
	struct sched_domain *domain;

	mutex_lock(&sched_domains_mutex);
	rcu_read_lock();

	for_each_domain(cpu, domain) {
		int bcpu = group_balance_cpu(domain->groups);

		init_sched_groups_capacity(cpu, domain);

		/* The group's balancing CPU must be refreshed as well. */
		if (bcpu != cpu)
			init_sched_groups_capacity(bcpu, domain);
	}

	rcu_read_unlock();
	mutex_unlock(&sched_domains_mutex);
}

static unsigned int cpu_isolation_vote[NR_CPUS];

/*
 * Return how many CPUs in @mask are isolated, optionally counting
 * offline CPUs as unavailable too.
 */
int sched_isolate_count(const cpumask_t *mask, bool include_offline)
{
	cpumask_t count_mask = CPU_MASK_NONE;

	if (!include_offline) {
		cpumask_and(&count_mask, mask, cpu_isolated_mask);
		return cpumask_weight(&count_mask);
	}

	/* Unavailable = (~online | isolated), restricted to @mask. */
	cpumask_complement(&count_mask, cpu_online_mask);
	cpumask_or(&count_mask, &count_mask, cpu_isolated_mask);
	cpumask_and(&count_mask, &count_mask, mask);

	return cpumask_weight(&count_mask);
}

/*
 * 1) CPU is isolated and cpu is offlined:
 *	Unisolate the core.
 * 2) CPU is not isolated and CPU is offlined:
 *	No action taken.
 * 3) CPU is offline and request to isolate
 *	Request ignored.
 * 4) CPU is offline and isolated:
 *	Not a possible state.
 * 5) CPU is online and request to isolate
 *	Normal case: Isolate the CPU
 * 6) CPU is not isolated and comes back online
 *	Nothing to do
 *
 * Note: The client calling sched_isolate_cpu() is responsible for ONLY
 * calling sched_unisolate_cpu() on a CPU that the client previously isolated.
 * Client is also responsible for unisolating when a core goes offline
 * (after CPU is marked offline).
 */
int sched_isolate_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	cpumask_t avail_cpus;
	int ret_code = 0;

	/* Serialize against hotplug and other isolation requests. */
	lock_device_hotplug();

	cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask);

	/* We cannot isolate ALL cpus in the system */
	if (cpumask_weight(&avail_cpus) == 1) {
		ret_code = -EINVAL;
		goto out;
	}

	if (!cpu_online(cpu)) {
		ret_code = -EINVAL;
		goto out;
	}

	/* Reference-counted: only the first vote does the actual work. */
	if (++cpu_isolation_vote[cpu] > 1)
		goto out;

	set_cpu_isolated(cpu, true);
	cpumask_clear_cpu(cpu, &avail_cpus);

	/* Migrate timers */
	smp_call_function_any(&avail_cpus, hrtimer_quiesce_cpu, &cpu, 1);
	smp_call_function_any(&avail_cpus, timer_quiesce_cpu, &cpu, 1);

	/* Drain the CPU of IRQs and movable tasks via the stopper thread. */
	stop_cpus(cpumask_of(cpu), do_isolation_work_cpu_stop, 0);

	/* Fold this CPU's load and refresh balancing state/capacities. */
	calc_load_migrate(rq);
	update_max_interval();
	sched_update_group_capacities(cpu);

out:
	unlock_device_hotplug();
	return ret_code;
}

/*
 * Note: The client calling sched_isolate_cpu() is responsible for ONLY
 * calling sched_unisolate_cpu() on a CPU that the client previously isolated.
 * Client is also responsible for unisolating when a core goes offline
 * (after CPU is marked offline).
 *
 * Caller must already hold the device hotplug lock; see
 * sched_unisolate_cpu() for the locked variant.
 */
int sched_unisolate_cpu_unlocked(int cpu)
{
	int ret_code = 0;
	struct rq *rq = cpu_rq(cpu);

	lock_device_hotplug_assert();

	/* Unisolating a CPU that was never isolated is a caller bug. */
	if (!cpu_isolation_vote[cpu]) {
		ret_code = -EINVAL;
		goto out;
	}

	/* Reference-counted: only the last vote does the actual work. */
	if (--cpu_isolation_vote[cpu])
		goto out;

	if (cpu_online(cpu)) {
		unsigned long flags;

		/* Re-attach the rq to its root domain. */
		raw_spin_lock_irqsave(&rq->lock, flags);
		if (rq->rd) {
			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
			set_rq_online(rq);
		}
		raw_spin_unlock_irqrestore(&rq->lock, flags);
	}

	set_cpu_isolated(cpu, false);
	update_max_interval();
	sched_update_group_capacities(cpu);

	if (cpu_online(cpu)) {
		/* Re-arm the watchdog from the CPU itself. */
		stop_cpus(cpumask_of(cpu), do_unisolation_work_cpu_stop, 0);

		/* Kick CPU to immediately do load balancing */
		if (!atomic_fetch_or(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
			smp_send_reschedule(cpu);
	}

out:
	return ret_code;
}

/*
 * Locked wrapper around sched_unisolate_cpu_unlocked(): takes the device
 * hotplug lock to serialize against CPU hotplug and isolation requests.
 */
int sched_unisolate_cpu(int cpu)
{
	int ret_code;

	lock_device_hotplug();
	ret_code = sched_unisolate_cpu_unlocked(cpu);
	unlock_device_hotplug();
	return ret_code;
}

#endif /* CONFIG_HOTPLUG_CPU */

void set_rq_online(struct rq *rq)
@@ -5955,7 +6182,7 @@ int sched_cpu_dying(unsigned int cpu)
		BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
		set_rq_offline(rq);
	}
	migrate_tasks(rq, &rf);
	migrate_tasks(rq, &rf, true);
	BUG_ON(rq->nr_running != 1);
	rq_unlock_irqrestore(rq, &rf);

+46 −14
Original line number Diff line number Diff line
@@ -8553,6 +8553,8 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
			struct sched_group_capacity *sgc;
			struct rq *rq = cpu_rq(cpu);

			if (cpumask_test_cpu(cpu, cpu_isolated_mask))
				continue;
			/*
			 * build_sched_domains() -> init_sched_groups_capacity()
			 * gets here before we've attached the domains to the
@@ -8583,10 +8585,15 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
		group = child->groups;
		do {
			struct sched_group_capacity *sgc = group->sgc;
			cpumask_t *cpus = sched_group_span(group);

			if (!cpu_isolated(cpumask_first(cpus))) {
				capacity += sgc->capacity;
			min_capacity = min(sgc->min_capacity, min_capacity);
			max_capacity = max(sgc->max_capacity, max_capacity);
				min_capacity = min(sgc->min_capacity,
							min_capacity);
				max_capacity = min(sgc->max_capacity,
							max_capacity);
			}
			group = group->next;
		} while (group != child->groups);
	}
@@ -8770,6 +8777,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
		struct rq *rq = cpu_rq(i);

		if (cpu_isolated(i))
			continue;

		if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
			env->flags |= LBF_NOHZ_AGAIN;

@@ -8808,12 +8818,18 @@ static inline void update_sg_lb_stats(struct lb_env *env,
		}
	}

	/* Isolated CPU has no weight */
	if (!group->group_weight) {
		sgs->group_capacity = 0;
		sgs->avg_load = 0;
		sgs->group_no_capacity = 1;
		sgs->group_type = group_other;
		sgs->group_weight = group->group_weight;
	} else {
		/* Adjust by relative CPU capacity of the group */
		sgs->group_capacity = group->sgc->capacity;
	sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;

	if (sgs->sum_nr_running)
		sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
		sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) /
							sgs->group_capacity;

		sgs->group_weight = group->group_weight;

@@ -8821,6 +8837,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
		sgs->group_type = group_classify(group, sgs);
	}

	if (sgs->sum_nr_running)
		sgs->load_per_task = sgs->sum_weighted_load /
						sgs->sum_nr_running;
}

/**
 * update_sd_pick_busiest - return 1 on busiest group
 * @env: The load balancing environment.
@@ -9959,7 +9980,13 @@ static DEFINE_SPINLOCK(balancing);
 */
/*
 * Scale max_load_balance_interval with the number of CPUs that can
 * actually take work: online CPUs that are not isolated.
 *
 * (The stripped-diff rendering had left the pre-patch
 * "HZ*num_online_cpus()/10" statement mashed in above the new body,
 * producing a duplicate assignment and a declaration after a statement;
 * this is the reconstructed post-patch function.)
 */
void update_max_interval(void)
{
	cpumask_t avail_mask;
	unsigned int available_cpus;

	cpumask_andnot(&avail_mask, cpu_online_mask, cpu_isolated_mask);
	available_cpus = cpumask_weight(&avail_mask);

	max_load_balance_interval = HZ*available_cpus/10;
}

/*
@@ -10511,6 +10538,9 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
	int pulled_task = 0;
	u64 curr_cost = 0;

	if (cpu_isolated(this_cpu))
		return 0;

	/*
	 * We must set idle_stamp _before_ calling idle_balance(), such that we
	 * measure the duration of idle_balance() as idle time.
@@ -10647,8 +10677,10 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
 */
void trigger_load_balance(struct rq *rq)
{
	/* Don't need to rebalance while attached to NULL domain */
	if (unlikely(on_null_domain(rq)))
	/*
	 * No need to rebalance while attached to NULL domain or while
	 * the CPU is isolated.
	 */
	if (unlikely(on_null_domain(rq)) || cpu_isolated(cpu_of(rq)))
		return;

	if (time_after_eq(jiffies, rq->next_balance))
+8 −3
Original line number Diff line number Diff line
@@ -262,8 +262,12 @@ static void pull_rt_task(struct rq *this_rq);

/*
 * Should we try to pull RT tasks to this rq?  Only when dropping this
 * rq's priority could let a queued RT task run here, and never onto an
 * isolated CPU.
 *
 * (The stripped-diff rendering had left the pre-patch comment and return
 * statement mashed in above the new ones, yielding two returns; this is
 * the reconstructed post-patch function.)
 */
static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
{
	/*
	 * Try to pull RT tasks here if we lower this rq's prio and the
	 * CPU is not isolated.
	 */
	return rq->rt.highest_prio.curr > prev->prio &&
	       !cpu_isolated(cpu_of(rq));
}

static inline int rt_overloaded(struct rq *rq)
@@ -2209,7 +2213,8 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
	 * we may need to handle the pulling of RT tasks
	 * now.
	 */
	if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
	if (!task_on_rq_queued(p) || rq->rt.rt_nr_running ||
		cpu_isolated(cpu_of(rq)))
		return;

	rt_queue_pull_task(rq);
+2 −0
Original line number Diff line number Diff line
@@ -166,6 +166,7 @@ extern long calc_load_fold_active(struct rq *this_rq, long adjust);

#ifdef CONFIG_SMP
extern void cpu_load_update_active(struct rq *this_rq);
extern void init_sched_groups_capacity(int cpu, struct sched_domain *sd);
#else
static inline void cpu_load_update_active(struct rq *this_rq) { }
#endif
@@ -1749,6 +1750,7 @@ extern const struct sched_class idle_sched_class;
extern void update_group_capacity(struct sched_domain *sd, int cpu);

extern void trigger_load_balance(struct rq *rq);
extern void nohz_balance_clear_nohz_mask(int cpu);

extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);

Loading