Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 8be67373 authored by Greg Kroah-Hartman
Browse files

Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Ingo writes:
  "scheduler fixes:

   These fixes address a rather involved performance regression between
   v4.17->v4.19 in the sched/numa auto-balancing code. Since distros
   really need this fix we accelerated it to sched/urgent for a faster
   upstream merge.

   NUMA scheduling and balancing performance is now largely back to
   v4.17 levels, without reintroducing the NUMA placement bugs that
   v4.18 and v4.19 fixed.

   Many thanks to Srikar Dronamraju, Mel Gorman and Jirka Hladky, for
   reporting, testing, re-testing and solving this rather complex set of
   bugs."

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/numa: Migrate pages to local nodes quicker early in the lifetime of a task
  mm, sched/numa: Remove rate-limiting of automatic NUMA balancing migration
  sched/numa: Avoid task migration for small NUMA improvement
  mm/migrate: Use spin_trylock() while resetting rate limit
  sched/numa: Limit the conditions where scan period is reset
  sched/numa: Reset scan rate whenever task moves across nodes
  sched/numa: Pass destination CPU as a parameter to migrate_task_rq
  sched/numa: Stop multiple tasks from moving to the CPU at the same time
parents 1df377db 37355bdc
Loading
Loading
Loading
Loading
+0 −6
Original line number Original line Diff line number Diff line
@@ -671,12 +671,6 @@ typedef struct pglist_data {
#ifdef CONFIG_NUMA_BALANCING
#ifdef CONFIG_NUMA_BALANCING
	/* Lock serializing the migrate rate limiting window */
	/* Lock serializing the migrate rate limiting window */
	spinlock_t numabalancing_migrate_lock;
	spinlock_t numabalancing_migrate_lock;

	/* Rate limiting time interval */
	unsigned long numabalancing_migrate_next_window;

	/* Number of pages migrated during the rate limiting time interval */
	unsigned long numabalancing_migrate_nr_pages;
#endif
#endif
	/*
	/*
	 * This is a per-node reserve of pages that are not available
	 * This is a per-node reserve of pages that are not available
+0 −27
Original line number Original line Diff line number Diff line
@@ -70,33 +70,6 @@ TRACE_EVENT(mm_migrate_pages,
		__print_symbolic(__entry->mode, MIGRATE_MODE),
		__print_symbolic(__entry->mode, MIGRATE_MODE),
		__print_symbolic(__entry->reason, MIGRATE_REASON))
		__print_symbolic(__entry->reason, MIGRATE_REASON))
);
);

TRACE_EVENT(mm_numa_migrate_ratelimit,

	TP_PROTO(struct task_struct *p, int dst_nid, unsigned long nr_pages),

	TP_ARGS(p, dst_nid, nr_pages),

	TP_STRUCT__entry(
		__array(	char,		comm,	TASK_COMM_LEN)
		__field(	pid_t,		pid)
		__field(	int,		dst_nid)
		__field(	unsigned long,	nr_pages)
	),

	TP_fast_assign(
		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
		__entry->pid		= p->pid;
		__entry->dst_nid	= dst_nid;
		__entry->nr_pages	= nr_pages;
	),

	TP_printk("comm=%s pid=%d dst_nid=%d nr_pages=%lu",
		__entry->comm,
		__entry->pid,
		__entry->dst_nid,
		__entry->nr_pages)
);
#endif /* _TRACE_MIGRATE_H */
#endif /* _TRACE_MIGRATE_H */


/* This part must be outside protection */
/* This part must be outside protection */
+1 −1
Original line number Original line Diff line number Diff line
@@ -1167,7 +1167,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)


	if (task_cpu(p) != new_cpu) {
	if (task_cpu(p) != new_cpu) {
		if (p->sched_class->migrate_task_rq)
		if (p->sched_class->migrate_task_rq)
			p->sched_class->migrate_task_rq(p);
			p->sched_class->migrate_task_rq(p, new_cpu);
		p->se.nr_migrations++;
		p->se.nr_migrations++;
		rseq_migrate(p);
		rseq_migrate(p);
		perf_event_task_migrate(p);
		perf_event_task_migrate(p);
+1 −1
Original line number Original line Diff line number Diff line
@@ -1607,7 +1607,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
	return cpu;
	return cpu;
}
}


static void migrate_task_rq_dl(struct task_struct *p)
static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused)
{
{
	struct rq *rq;
	struct rq *rq;


+91 −13
Original line number Original line Diff line number Diff line
@@ -1392,6 +1392,17 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
	int last_cpupid, this_cpupid;
	int last_cpupid, this_cpupid;


	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
	last_cpupid = page_cpupid_xchg_last(page, this_cpupid);

	/*
	 * Allow first faults or private faults to migrate immediately early in
	 * the lifetime of a task. The magic number 4 is based on waiting for
	 * two full passes of the "multi-stage node selection" test that is
	 * executed below.
	 */
	if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) &&
	    (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
		return true;


	/*
	/*
	 * Multi-stage node selection is used in conjunction with a periodic
	 * Multi-stage node selection is used in conjunction with a periodic
@@ -1410,7 +1421,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
	 * This quadric squishes small probabilities, making it less likely we
	 * This quadric squishes small probabilities, making it less likely we
	 * act on an unlikely task<->page relation.
	 * act on an unlikely task<->page relation.
	 */
	 */
	last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
	if (!cpupid_pid_unset(last_cpupid) &&
	if (!cpupid_pid_unset(last_cpupid) &&
				cpupid_to_nid(last_cpupid) != dst_nid)
				cpupid_to_nid(last_cpupid) != dst_nid)
		return false;
		return false;
@@ -1514,6 +1524,21 @@ struct task_numa_env {
static void task_numa_assign(struct task_numa_env *env,
static void task_numa_assign(struct task_numa_env *env,
			     struct task_struct *p, long imp)
			     struct task_struct *p, long imp)
{
{
	struct rq *rq = cpu_rq(env->dst_cpu);

	/* Bail out if run-queue part of active NUMA balance. */
	if (xchg(&rq->numa_migrate_on, 1))
		return;

	/*
	 * Clear previous best_cpu/rq numa-migrate flag, since task now
	 * found a better CPU to move/swap.
	 */
	if (env->best_cpu != -1) {
		rq = cpu_rq(env->best_cpu);
		WRITE_ONCE(rq->numa_migrate_on, 0);
	}

	if (env->best_task)
	if (env->best_task)
		put_task_struct(env->best_task);
		put_task_struct(env->best_task);
	if (p)
	if (p)
@@ -1552,6 +1577,13 @@ static bool load_too_imbalanced(long src_load, long dst_load,
	return (imb > old_imb);
	return (imb > old_imb);
}
}


/*
 * Maximum NUMA importance can be 1998 (2*999);
 * SMALLIMP @ 30 would be close to 1998/64.
 * Used to deter task migration.
 */
#define SMALLIMP	30

/*
/*
 * This checks if the overall compute and NUMA accesses of the system would
 * This checks if the overall compute and NUMA accesses of the system would
 * be improved if the source tasks was migrated to the target dst_cpu taking
 * be improved if the source tasks was migrated to the target dst_cpu taking
@@ -1569,6 +1601,9 @@ static void task_numa_compare(struct task_numa_env *env,
	long moveimp = imp;
	long moveimp = imp;
	int dist = env->dist;
	int dist = env->dist;


	if (READ_ONCE(dst_rq->numa_migrate_on))
		return;

	rcu_read_lock();
	rcu_read_lock();
	cur = task_rcu_dereference(&dst_rq->curr);
	cur = task_rcu_dereference(&dst_rq->curr);
	if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
	if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
@@ -1582,7 +1617,7 @@ static void task_numa_compare(struct task_numa_env *env,
		goto unlock;
		goto unlock;


	if (!cur) {
	if (!cur) {
		if (maymove || imp > env->best_imp)
		if (maymove && moveimp >= env->best_imp)
			goto assign;
			goto assign;
		else
		else
			goto unlock;
			goto unlock;
@@ -1625,15 +1660,21 @@ static void task_numa_compare(struct task_numa_env *env,
			       task_weight(cur, env->dst_nid, dist);
			       task_weight(cur, env->dst_nid, dist);
	}
	}


	if (imp <= env->best_imp)
		goto unlock;

	if (maymove && moveimp > imp && moveimp > env->best_imp) {
	if (maymove && moveimp > imp && moveimp > env->best_imp) {
		imp = moveimp - 1;
		imp = moveimp;
		cur = NULL;
		cur = NULL;
		goto assign;
		goto assign;
	}
	}


	/*
	 * If the NUMA importance is less than SMALLIMP,
	 * task migration might only result in ping pong
	 * of tasks and also hurt performance due to cache
	 * misses.
	 */
	if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
		goto unlock;

	/*
	/*
	 * In the overloaded case, try and keep the load balanced.
	 * In the overloaded case, try and keep the load balanced.
	 */
	 */
@@ -1710,6 +1751,7 @@ static int task_numa_migrate(struct task_struct *p)
		.best_cpu = -1,
		.best_cpu = -1,
	};
	};
	struct sched_domain *sd;
	struct sched_domain *sd;
	struct rq *best_rq;
	unsigned long taskweight, groupweight;
	unsigned long taskweight, groupweight;
	int nid, ret, dist;
	int nid, ret, dist;
	long taskimp, groupimp;
	long taskimp, groupimp;
@@ -1805,20 +1847,17 @@ static int task_numa_migrate(struct task_struct *p)
	if (env.best_cpu == -1)
	if (env.best_cpu == -1)
		return -EAGAIN;
		return -EAGAIN;


	/*
	best_rq = cpu_rq(env.best_cpu);
	 * Reset the scan period if the task is being rescheduled on an
	 * alternative node to recheck if the tasks is now properly placed.
	 */
	p->numa_scan_period = task_scan_start(p);

	if (env.best_task == NULL) {
	if (env.best_task == NULL) {
		ret = migrate_task_to(p, env.best_cpu);
		ret = migrate_task_to(p, env.best_cpu);
		WRITE_ONCE(best_rq->numa_migrate_on, 0);
		if (ret != 0)
		if (ret != 0)
			trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
			trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
		return ret;
		return ret;
	}
	}


	ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
	ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
	WRITE_ONCE(best_rq->numa_migrate_on, 0);


	if (ret != 0)
	if (ret != 0)
		trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
		trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
@@ -2596,6 +2635,39 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
	}
	}
}
}


/*
 * Restart the NUMA scan period of @p when it is being moved to a CPU on a
 * different node.
 *
 * @p:       task being migrated; task_cpu(p) is read as the source CPU.
 * @new_cpu: CPU the task is moving to.
 *
 * Nothing is done when the sched_numa_balancing static branch is off, when
 * @p has no mm or no numa_faults, when @p is exiting, or when the source and
 * destination CPUs map to the same node. If p->numa_scan_seq is non-zero,
 * the reset is additionally skipped when the destination is the preferred
 * node, or when a preferred node is set and the task was not running on it.
 * In every other case p->numa_scan_period is set to task_scan_start(p).
 */
static void update_scan_period(struct task_struct *p, int new_cpu)
{
	int src_nid = cpu_to_node(task_cpu(p));
	int dst_nid = cpu_to_node(new_cpu);

	if (!static_branch_likely(&sched_numa_balancing))
		return;

	/* Only tasks with an mm and NUMA fault statistics, and not exiting. */
	if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
		return;

	/* Same-node moves never reset the scan period. */
	if (src_nid == dst_nid)
		return;

	/*
	 * Allow resets if faults have been trapped before one scan
	 * has completed. This is most likely due to a new task that
	 * is pulled cross-node due to wakeups or load balancing.
	 */
	if (p->numa_scan_seq) {
		/*
		 * Avoid scan adjustments if moving to the preferred
		 * node or if the task was not previously running on
		 * the preferred node.
		 */
		if (dst_nid == p->numa_preferred_nid ||
		    (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid))
			return;
	}

	/* Cross-node move that passed the filters above: restart scanning. */
	p->numa_scan_period = task_scan_start(p);
}

#else
#else
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
{
@@ -2609,6 +2681,10 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
{
}
}


/* Stub for builds without CONFIG_NUMA_BALANCING: there is no scan period to reset. */
static inline void update_scan_period(struct task_struct *p, int new_cpu)
{
}

#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_NUMA_BALANCING */


static void
static void
@@ -6275,7 +6351,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se);
 * cfs_rq_of(p) references at time of call are still valid and identify the
 * cfs_rq_of(p) references at time of call are still valid and identify the
 * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
 * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
 */
 */
static void migrate_task_rq_fair(struct task_struct *p)
static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
{
{
	/*
	/*
	 * As blocked tasks retain absolute vruntime the migration needs to
	 * As blocked tasks retain absolute vruntime the migration needs to
@@ -6328,6 +6404,8 @@ static void migrate_task_rq_fair(struct task_struct *p)


	/* We have migrated, no longer consider this task hot */
	/* We have migrated, no longer consider this task hot */
	p->se.exec_start = 0;
	p->se.exec_start = 0;

	update_scan_period(p, new_cpu);
}
}


static void task_dead_fair(struct task_struct *p)
static void task_dead_fair(struct task_struct *p)
Loading